forked from colbymchenry/codegraph
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery-utils.ts
More file actions
342 lines (302 loc) · 11.8 KB
/
query-utils.ts
File metadata and controls
342 lines (302 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/**
* Search Query Utilities
*
* Shared module for search term extraction and scoring.
*/
import * as path from 'path';
import { Node } from '../types';
/**
* Common stop words to filter from search queries.
* Includes generic English + code-specific noise words.
*/
export const STOP_WORDS = new Set([
// English
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
'should', 'may', 'might', 'can', 'shall', 'not', 'no', 'all', 'each',
'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
'show', 'give', 'tell',
'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
'over', 'only', 'out', 'its', 'so', 'up', 'as', 'if',
'look', 'need', 'needs', 'want', 'happen', 'happens',
'affect', 'affected', 'break', 'breaks', 'failing',
'implemented', 'implement',
// Code-specific noise (avoid filtering common symbol names like get/set/add/build/find/list)
'code', 'file', 'files', 'function', 'method', 'class', 'type',
'fix', 'bug', 'called',
]);
/**
* Generate stem variants of a search term by removing common English suffixes.
* Used for FTS query expansion so "caching" also finds "cache", "eviction" finds "evict", etc.
* Stems are used as PREFIX matches in FTS, so they don't need to be perfect English words.
*/
export function getStemVariants(term: string): string[] {
const variants = new Set<string>();
const t = term.toLowerCase();
// -ing: caching→cach/cache, handling→handl/handle, running→run
if (t.endsWith('ing') && t.length > 5) {
const base = t.slice(0, -3);
variants.add(base);
variants.add(base + 'e');
if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
variants.add(base.slice(0, -1));
}
}
// -tion/-sion: eviction→evict, expression→express
if ((t.endsWith('tion') || t.endsWith('sion')) && t.length > 5) {
variants.add(t.slice(0, -3));
}
// -ment: management→manage
if (t.endsWith('ment') && t.length > 6) {
variants.add(t.slice(0, -4));
}
// -ies: entries→entry
if (t.endsWith('ies') && t.length > 4) {
variants.add(t.slice(0, -3) + 'y');
}
// -es: processes→process, classes→class
else if (t.endsWith('es') && t.length > 4) {
variants.add(t.slice(0, -2));
}
// -s: errors→error (skip -ss endings like "class")
else if (t.endsWith('s') && !t.endsWith('ss') && t.length > 4) {
variants.add(t.slice(0, -1));
}
// -ed: handled→handle, propagated→propagate, carried→carry
if (t.endsWith('ed') && !t.endsWith('eed') && t.length > 4) {
variants.add(t.slice(0, -1));
variants.add(t.slice(0, -2));
if (t.endsWith('ied') && t.length > 5) {
variants.add(t.slice(0, -3) + 'y');
}
}
// -er: builder→build/builde, handler→handl/handle, getter→get
if (t.endsWith('er') && t.length > 4) {
const base = t.slice(0, -2);
variants.add(base);
variants.add(base + 'e');
if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
variants.add(base.slice(0, -1));
}
}
return [...variants].filter(v => v.length >= 3 && v !== t);
}
/**
* Extract meaningful search terms from a natural language query.
* Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
* into individual tokens before filtering.
*
* Preserves original compound identifiers (e.g., "scrapeLoop") alongside
* their split parts so that FTS can match both the full symbol name and
* individual words within it.
*
* Also generates stem variants (e.g., "caching"→"cache", "eviction"→"evict")
* so FTS prefix matching can find related code symbols.
*/
export function extractSearchTerms(query: string, options?: { stems?: boolean }): string[] {
const includeStems = options?.stems !== false;
const tokens = new Set<string>();
// First, extract and preserve compound identifiers before splitting
// CamelCase: scrapeLoop, UserService, getCallGraph
const compoundPattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:[A-Z][a-z]+)+|[A-Z][a-z]+(?:[A-Z][a-z]*)+)\b/g;
let match;
while ((match = compoundPattern.exec(query)) !== null) {
if (match[1] && match[1].length >= 3) {
tokens.add(match[1].toLowerCase()); // preserve full compound: "scrapeloop"
}
}
// snake_case: scrape_loop, user_service
const snakePattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:_[a-zA-Z0-9]+)+)\b/g;
while ((match = snakePattern.exec(query)) !== null) {
if (match[1] && match[1].length >= 3) {
tokens.add(match[1].toLowerCase());
}
}
// Split camelCase / PascalCase: "getUserName" → "get User Name"
const camelSplit = query
.replace(/([a-z])([A-Z])/g, '$1 $2')
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
// Replace underscores and dots with spaces (snake_case, dot.notation)
const normalised = camelSplit.replace(/[_.]+/g, ' ');
// Split on any non-alphanumeric character
const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
for (const word of words) {
const lower = word.toLowerCase();
if (lower.length < 3) continue;
if (STOP_WORDS.has(lower)) continue;
tokens.add(lower);
}
// Generate stem variants for broader FTS matching.
// "caching" → "cache" finds CacheBuilder; "eviction" → "evict" finds evictEntries.
// Also enables co-occurrence dampening by increasing term count above 1.
// Stems are skipped when scoring path relevance (stems inflate path scores).
if (includeStems) {
const stems = new Set<string>();
for (const token of tokens) {
for (const variant of getStemVariants(token)) {
if (!tokens.has(variant) && !STOP_WORDS.has(variant)) {
stems.add(variant);
}
}
}
for (const stem of stems) {
tokens.add(stem);
}
}
return [...tokens];
}
/**
* Score path relevance to a query
* Higher score = more relevant path
*/
export function scorePathRelevance(filePath: string, query: string): number {
// Use base terms only — stem variants inflate path scores by generating
// many near-duplicate terms that all match the same path segments.
const terms = extractSearchTerms(query, { stems: false });
if (terms.length === 0) return 0;
const pathLower = filePath.toLowerCase();
const fileName = path.basename(filePath).toLowerCase();
const dirName = path.dirname(filePath).toLowerCase();
let score = 0;
for (const term of terms) {
// Exact filename match (strongest)
if (fileName.includes(term)) score += 10;
// Directory match
if (dirName.includes(term)) score += 5;
// General path match
else if (pathLower.includes(term)) score += 3;
}
// Deprioritize test files unless the query is explicitly about tests
const queryLower = query.toLowerCase();
const isTestQuery = queryLower.includes('test') || queryLower.includes('spec');
if (!isTestQuery && isTestFile(filePath)) {
score -= 15;
}
return score;
}
/**
* Check if a file path looks like a test file
*/
export function isTestFile(filePath: string): boolean {
const lower = filePath.toLowerCase();
const fileName = path.basename(filePath); // original case — needed for camelCase boundaries
const lowerName = fileName.toLowerCase();
// --- Filename patterns ---
if (
lowerName.startsWith('test_') || // python: test_foo.py
lowerName.startsWith('test.') ||
// separator-delimited: foo_test.go, foo.test.ts, foo-spec.rb, bar_spec.py
/[._-](test|tests|spec|specs)\.[a-z0-9]+$/.test(lowerName) ||
// CamelCase suffix (Java/Kotlin/Swift/C#/Scala): FooTest.kt, BarTests.swift,
// BazSpec.scala, QuxTestCase.java. Capital-led so "latest.kt"/"manifest.kt"
// (lowercase "test") are NOT matched.
/(?:Test|Tests|TestCase|Tester|Spec|Specs)\.[A-Za-z0-9]+$/.test(fileName)
) {
return true;
}
// --- Directory patterns ---
if (
lower.includes('/tests/') || lower.includes('/test/') ||
lower.includes('/__tests__/') || lower.includes('/spec/') ||
lower.includes('/specs/') || lower.includes('/testlib/') ||
lower.includes('/testing/') ||
lower.startsWith('test/') || lower.startsWith('tests/') ||
lower.startsWith('spec/') || lower.startsWith('specs/') ||
// CamelCase test source-set dirs (Kotlin Multiplatform / Gradle / Xcode):
// jvmTest/, commonTest/, androidTest/, iosTest/, integrationTest/. Capital-led
// so "latest/" / "manifest/" are not matched.
/(?:^|\/)[A-Za-z0-9]*(?:Test|Tests|Spec)\//.test(filePath)
) {
return true;
}
// Non-production directories: examples, samples, benchmarks, fixtures, demos.
// Check both mid-path (/integration/) and start-of-path (integration/) since
// file paths may be stored as relative paths without a leading slash.
return matchesNonProductionDir(lower);
}
/**
* Check if a path is in a non-production directory (integration, sample, example, etc.)
* Handles both absolute paths (/foo/integration/bar) and relative paths (integration/bar).
*/
function matchesNonProductionDir(lowerPath: string): boolean {
const dirs = [
'integration', 'sample', 'samples', 'example', 'examples',
'fixture', 'fixtures', 'benchmark', 'benchmarks', 'demo', 'demos',
];
for (const dir of dirs) {
if (lowerPath.includes('/' + dir + '/') || lowerPath.startsWith(dir + '/')) {
return true;
}
}
return false;
}
/**
* Bonus when a node's name matches the search query.
* Exact matches get the largest boost; prefix matches get smaller boosts.
* Multi-word queries also check individual term matches against the name.
*/
export function nameMatchBonus(nodeName: string, query: string): number {
const nameLower = nodeName.toLowerCase();
// Split query into word-level terms (handles "CacheBuilder build" → ["cache","builder","build"])
const rawTerms = query
.replace(/([a-z])([A-Z])/g, '$1 $2')
.split(/[\s_.\-]+/)
.map(t => t.toLowerCase())
.filter(t => t.length >= 2);
// Also keep original space-separated tokens for exact-term matching
const queryTokens = query.split(/\s+/).map(t => t.toLowerCase()).filter(t => t.length >= 2);
// Full query as a single token (for compound identifiers like "CacheBuilder")
const queryLower = query.replace(/[\s]+/g, '').toLowerCase();
// Exact match: query exactly equals the node name
if (nameLower === queryLower) return 80;
// Exact match on a query token: "CacheBuilder build" and node name is "build"
if (queryTokens.length > 1 && queryTokens.includes(nameLower)) return 60;
// Name starts with query — scale by length ratio so "Pod"→"Pod" (exact, handled above)
// scores much higher than "Pod"→"PodGCControllerOptions" (ratio 0.125).
if (nameLower.startsWith(queryLower)) {
const ratio = queryLower.length / nameLower.length;
return Math.round(10 + 30 * ratio);
}
// All camelCase-split terms appear in the name
if (rawTerms.length > 1) {
const allMatch = rawTerms.every(t => nameLower.includes(t));
if (allMatch) return 15;
}
// Name contains the full query as substring
if (nameLower.includes(queryLower)) return 10;
return 0;
}
/**
* Kind-based bonus for search ranking
* Functions and classes are typically more relevant than variables/imports
*/
export function kindBonus(kind: Node['kind']): number {
const bonuses: Record<string, number> = {
function: 10,
method: 10,
class: 8,
interface: 9,
type_alias: 6,
struct: 6,
trait: 9,
enum: 5,
component: 8,
route: 9,
module: 4,
property: 3,
field: 3,
variable: 2,
constant: 3,
import: 1,
export: 1,
parameter: 0,
namespace: 4,
file: 0,
protocol: 9,
enum_member: 3,
};
return bonuses[kind] ?? 0;
}