forked from colbymchenry/codegraph
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery-parser.ts
More file actions
184 lines (173 loc) · 5.69 KB
/
query-parser.ts
File metadata and controls
184 lines (173 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/**
* Field-qualified search query parser.
*
* Splits a raw query like
*
* kind:function name:auth path:src/api authenticate
*
* into structured filters (kind=function, name="auth", path prefix
* "src/api") plus the free-text portion ("authenticate") that goes
* to FTS. Free-text and filters compose: filters narrow the result
* set, FTS scores within the narrowed set.
*
* Recognised fields (case-insensitive, value is the rest until
* whitespace):
*
* kind: one of function|method|class|interface|struct|...
* lang: one of typescript|python|go|... (alias: language:)
* path: case-insensitive substring of file_path
* name: case-insensitive substring of the symbol's name
*
* Unknown field prefixes (e.g. `foo:bar`) are passed through to FTS
* as plain text — that's how someone searching for `TODO:` gets a
* result instead of a parse error.
*
* Quoting:
* kind:function path:"src/some path/with spaces" → handled by stripping
* the surrounding double quotes from the value (single token only,
* no nested escapes).
*/
import { NODE_KINDS, LANGUAGES } from '../types';
import type { NodeKind, Language } from '../types';
export interface ParsedQuery {
/** Free-text portion to feed to FTS / LIKE. May be empty. */
text: string;
/** kind: filters (OR'd). Empty when none specified. */
kinds: NodeKind[];
/** lang:/language: filters (OR'd). Empty when none specified. */
languages: Language[];
/** path: filters (OR'd, case-insensitive substring of file_path). Empty when none. */
pathFilters: string[];
/** name: filters (OR'd, case-insensitive substring of node.name). */
nameFilters: string[];
}
// Derived from the canonical `NODE_KINDS` / `LANGUAGES` arrays in
// types.ts so adding a new kind or language doesn't silently fall
// through to plain text here.
const KIND_VALUES: ReadonlySet<string> = new Set<NodeKind>(NODE_KINDS);
const LANGUAGE_VALUES: ReadonlySet<string> = new Set<Language>(LANGUAGES);
/**
* Strip a surrounding pair of double quotes from `s`. Allows users to
* keep whitespace in path filters: `path:"my dir/file"`.
*/
function unquote(s: string): string {
if (s.length >= 2 && s.startsWith('"') && s.endsWith('"')) return s.slice(1, -1);
return s;
}
/**
* Parse a raw query into structured filters + remaining text.
* Always returns a value; never throws.
*/
export function parseQuery(raw: string): ParsedQuery {
const out: ParsedQuery = {
text: '',
kinds: [],
languages: [],
pathFilters: [],
nameFilters: [],
};
// Tokenise on whitespace, preserving quoted spans as part of the
// current token. Quotes can appear at the start (`"…"`) OR mid-token
// (`path:"…"`); in both cases everything from the opening `"` to the
// matching `"` is included in the token, whitespace and all.
const tokens: string[] = [];
let i = 0;
while (i < raw.length) {
while (i < raw.length && /\s/.test(raw[i]!)) i++;
if (i >= raw.length) break;
const start = i;
while (i < raw.length && !/\s/.test(raw[i]!)) {
if (raw[i] === '"') {
const end = raw.indexOf('"', i + 1);
if (end === -1) {
// Unterminated quote — swallow the rest of the input as
// one token. Forgiving rather than throwing.
i = raw.length;
break;
}
i = end + 1;
continue;
}
i++;
}
tokens.push(raw.slice(start, i));
}
const textParts: string[] = [];
for (const tok of tokens) {
const colon = tok.indexOf(':');
if (colon <= 0 || colon === tok.length - 1) {
textParts.push(tok);
continue;
}
const key = tok.slice(0, colon).toLowerCase();
const valueRaw = unquote(tok.slice(colon + 1));
if (!valueRaw) {
textParts.push(tok);
continue;
}
switch (key) {
case 'kind': {
if (KIND_VALUES.has(valueRaw)) {
out.kinds.push(valueRaw as NodeKind);
} else {
textParts.push(tok);
}
break;
}
case 'lang':
case 'language': {
const lower = valueRaw.toLowerCase();
if (LANGUAGE_VALUES.has(lower)) {
out.languages.push(lower as Language);
} else {
textParts.push(tok);
}
break;
}
case 'path':
out.pathFilters.push(valueRaw);
break;
case 'name':
out.nameFilters.push(valueRaw);
break;
default:
textParts.push(tok);
}
}
out.text = textParts.join(' ').trim();
return out;
}
/**
* Damerau-Levenshtein-ish bounded edit distance. Returns `maxDist + 1`
* as soon as the distance is known to exceed `maxDist`; that early-exit
* makes the fuzzy fallback cheap even over tens of thousands of names.
*
* Pure DP, O(min(len(a), len(b))) memory. Compares case-folded inputs;
* callers should pass `lowercase(name)` strings.
*/
export function boundedEditDistance(a: string, b: string, maxDist: number): number {
if (a === b) return 0;
const al = a.length;
const bl = b.length;
if (Math.abs(al - bl) > maxDist) return maxDist + 1;
if (al === 0) return bl;
if (bl === 0) return al;
let prev = new Array<number>(bl + 1);
let cur = new Array<number>(bl + 1);
for (let j = 0; j <= bl; j++) prev[j] = j;
for (let i = 1; i <= al; i++) {
cur[0] = i;
let rowMin = cur[0]!;
for (let j = 1; j <= bl; j++) {
const cost = a.charCodeAt(i - 1) === b.charCodeAt(j - 1) ? 0 : 1;
const insertion = cur[j - 1]! + 1;
const deletion = prev[j]! + 1;
const substitution = prev[j - 1]! + cost;
cur[j] = Math.min(insertion, deletion, substitution);
if (cur[j]! < rowMin) rowMin = cur[j]!;
}
if (rowMin > maxDist) return maxDist + 1;
[prev, cur] = [cur, prev];
}
return prev[bl]!;
}