Skip to content

Commit 36e705b

Browse files
author
kristofr
committed
optimize literal scan
1 parent 82f7d02 commit 36e705b

7 files changed

Lines changed: 115 additions & 42 deletions

File tree

indexserver/index_queue.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ def __init__(self, batch_size: int = PARSE_CHUNK, max_file_bytes: int = 3 * 1024
6565
self._resolve: BackendResolver | None = None
6666
self._thread: threading.Thread | None = None
6767
self._stop = threading.Event()
68+
# tid -> (rel_path, size, t_start) for each parse worker currently busy.
69+
# Used by stop()'s diagnostic dump and the "slow parse" log.
70+
self._busy: dict[int, tuple[str, int, float]] = {}
71+
self._busy_lock = threading.Lock()
6872
# Counters
6973
self._n_enqueued = 0
7074
self._n_deduped = 0
@@ -91,8 +95,36 @@ def stop(self, timeout: float = 10.0) -> None:
9195
self._stop.set()
9296
with self._cond:
9397
self._cond.notify_all()
94-
if self._thread:
95-
self._thread.join(timeout=timeout)
98+
if not self._thread:
99+
return
100+
101+
# Poll-join with a busy-file dump every second if the worker is still alive.
102+
# Diagnostics for "queue won't drain" — shows which files parse workers
103+
# are stuck on (typically large/minified JS or deeply nested ASTs).
104+
deadline = time.monotonic() + timeout
105+
while self._thread.is_alive() and time.monotonic() < deadline:
106+
self._thread.join(timeout=1.0)
107+
if not self._thread.is_alive():
108+
break
109+
now = time.perf_counter()
110+
with self._busy_lock:
111+
snapshot = list(self._busy.items())
112+
if snapshot:
113+
files = ", ".join(
114+
f"{rel} ({size:,}B, {now - t0:.1f}s)"
115+
for _tid, (rel, size, t0) in snapshot
116+
)
117+
print(
118+
f"[index-queue] worker still alive (items_left={len(self._items)}); "
119+
f"parsing: {files}",
120+
flush=True,
121+
)
122+
else:
123+
print(
124+
f"[index-queue] worker still alive (items_left={len(self._items)}); "
125+
f"no parse workers busy",
126+
flush=True,
127+
)
96128

97129
# ── public interface ──────────────────────────────────────────────────────
98130

@@ -245,16 +277,30 @@ def _parse_one(item):
245277
if action == "delete":
246278
return ("delete", collection, _file_id(rel))
247279
try:
248-
if os.path.getsize(full_path) > max_bytes:
280+
size = os.path.getsize(full_path)
281+
if size > max_bytes:
249282
return None
250283
except OSError:
251284
return None
285+
tid = threading.get_ident()
286+
t0 = time.perf_counter()
287+
with self._busy_lock:
288+
self._busy[tid] = (rel, size, t0)
252289
try:
253290
doc = build_document(full_path, rel)
291+
t = time.perf_counter() - t0
292+
if t > 1.0:
293+
print(
294+
f"[index-queue] SLOW parse: {rel} took {t:.2f}s ({size:,} bytes)",
295+
flush=True,
296+
)
254297
if doc:
255298
return ("upsert", collection, doc)
256299
except OSError:
257300
pass
301+
finally:
302+
with self._busy_lock:
303+
self._busy.pop(tid, None)
258304
return None
259305

260306
t_parse_start = time.perf_counter()

query/cpp.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -457,13 +457,20 @@ def cpp_q_all_refs(src, tree, lines, name):
457457

458458
def _collect_all_refs(src: bytes, tree) -> set[str]:
459459
"""Deduped set of identifier/type-identifier texts from a parsed C/C++ tree,
460-
excluding tokens inside literal nodes (strings, char literals, comments)."""
460+
excluding tokens inside literal nodes (strings, char literals, comments).
461+
462+
Single-pass walk that tracks "currently inside a literal" depth to avoid
463+
O(N×D) parent-chain walks per identifier.
464+
"""
461465
out: set[str] = set()
462-
for node in _find_all(tree.root_node,
463-
lambda n: n.type in ("identifier", "type_identifier")):
464-
if _in_literal(node):
465-
continue
466-
out.add(_text(node, src).strip())
466+
stack: list = [(tree.root_node, 0)]
467+
while stack:
468+
node, lit_depth = stack.pop()
469+
new_depth = lit_depth + (1 if node.type in _LITERAL_NODES else 0)
470+
if new_depth == 0 and node.type in ("identifier", "type_identifier"):
471+
out.add(_text(node, src).strip())
472+
for child in node.children:
473+
stack.append((child, new_depth))
467474
out.discard("")
468475
return out
469476

query/cs.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1238,12 +1238,20 @@ def q_all_refs(src, tree, lines, name):
12381238

12391239
def _collect_all_refs(src: bytes, tree) -> set[str]:
12401240
"""Deduped set of identifier texts from a parsed C# tree, excluding
1241-
identifiers inside literal nodes (strings, comments, char literals)."""
1241+
identifiers inside literal nodes (strings, comments, char literals).
1242+
1243+
Single-pass walk that tracks "currently inside a literal" depth to avoid
1244+
O(N×D) parent-chain walks per identifier.
1245+
"""
12421246
out: set[str] = set()
1243-
for node in _find_all(tree.root_node, lambda n: n.type == "identifier"):
1244-
if _in_literal(node):
1245-
continue
1246-
out.add(_text(node, src))
1247+
stack: list = [(tree.root_node, 0)]
1248+
while stack:
1249+
node, lit_depth = stack.pop()
1250+
new_depth = lit_depth + (1 if node.type in _LITERAL_NODES else 0)
1251+
if new_depth == 0 and node.type == "identifier":
1252+
out.add(_text(node, src))
1253+
for child in node.children:
1254+
stack.append((child, new_depth))
12471255
return out
12481256

12491257

query/js.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -327,13 +327,21 @@ def js_q_all_refs(src, tree, lines, name):
327327
def _collect_all_refs(src: bytes, tree) -> set[str]:
328328
"""Deduped set of identifier/type-identifier texts from a parsed JS/TS tree,
329329
excluding tokens inside literal nodes (strings, template strings, comments,
330-
regex literals)."""
330+
regex literals).
331+
332+
Single-pass tree walk that tracks "currently inside a literal" depth.
333+
The naive implementation called _in_literal (parent-chain walk) for every
334+
identifier — O(N×D) and pathological on deeply-nested minified JS.
335+
"""
331336
out: set[str] = set()
332-
for node in _find_all(tree.root_node,
333-
lambda n: n.type in ("identifier", "type_identifier")):
334-
if _in_literal(node):
335-
continue
336-
out.add(_text(node, src).strip())
337+
stack: list = [(tree.root_node, 0)]
338+
while stack:
339+
node, lit_depth = stack.pop()
340+
new_depth = lit_depth + (1 if node.type in _LITERAL_NODES else 0)
341+
if new_depth == 0 and node.type in ("identifier", "type_identifier"):
342+
out.add(_text(node, src).strip())
343+
for child in node.children:
344+
stack.append((child, new_depth))
337345
out.discard("")
338346
return out
339347

query/py.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,20 @@ def py_q_ident(src, tree, lines, name):
229229

230230
def _collect_all_refs(src: bytes, tree) -> set[str]:
231231
"""Deduped set of identifier texts from a parsed Python tree, excluding
232-
identifiers inside literal nodes (strings, comments)."""
232+
identifiers inside literal nodes (strings, comments).
233+
234+
Single-pass walk that tracks "currently inside a literal" depth to avoid
235+
O(N×D) parent-chain walks per identifier.
236+
"""
233237
out: set[str] = set()
234-
for node in _find_all(tree.root_node, lambda n: n.type == "identifier"):
235-
if _py_in_literal(node):
236-
continue
237-
out.add(_text(node, src))
238+
stack: list = [(tree.root_node, 0)]
239+
while stack:
240+
node, lit_depth = stack.pop()
241+
new_depth = lit_depth + (1 if node.type in _PY_LITERAL_NODES else 0)
242+
if new_depth == 0 and node.type == "identifier":
243+
out.add(_text(node, src))
244+
for child in node.children:
245+
stack.append((child, new_depth))
238246
return out
239247

240248

query/rust.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -324,13 +324,20 @@ def rust_q_all_refs(src, tree, lines, name):
324324

325325
def _collect_all_refs(src: bytes, tree) -> set[str]:
326326
"""Deduped set of identifier/type-identifier texts from a parsed Rust tree,
327-
excluding tokens inside literal nodes (strings, comments, char literals)."""
327+
excluding tokens inside literal nodes (strings, comments, char literals).
328+
329+
Single-pass walk that tracks "currently inside a literal" depth to avoid
330+
O(N×D) parent-chain walks per identifier.
331+
"""
328332
out: set[str] = set()
329-
for node in _find_all(tree.root_node,
330-
lambda n: n.type in ("identifier", "type_identifier")):
331-
if _in_literal(node):
332-
continue
333-
out.add(_text(node, src).strip())
333+
stack: list = [(tree.root_node, 0)]
334+
while stack:
335+
node, lit_depth = stack.pop()
336+
new_depth = lit_depth + (1 if node.type in _LITERAL_NODES else 0)
337+
if new_depth == 0 and node.type in ("identifier", "type_identifier"):
338+
out.add(_text(node, src).strip())
339+
for child in node.children:
340+
stack.append((child, new_depth))
334341
out.discard("")
335342
return out
336343

query/sql.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,6 @@ def _ast_text(node, src: bytes) -> str:
192192
return src[node.start_byte:node.end_byte].decode("utf-8", errors="replace")
193193

194194

195-
_SQL_LITERAL_NODES = {"comment", "marginalia", "string"}
196-
197195
_AST_TYPE_DECL_NODES = {"create_table", "create_view", "alter_table"}
198196

199197
_FUNCTION_DECL_NODES = {"create_function"}
@@ -205,15 +203,6 @@ def _ast_line(node) -> int:
205203
return node.start_point[0] + 1
206204

207205

208-
def _sql_in_literal(node) -> bool:
209-
p = node.parent
210-
while p:
211-
if p.type in _SQL_LITERAL_NODES:
212-
return True
213-
p = p.parent
214-
return False
215-
216-
217206
def _object_name(node, src: bytes) -> str:
218207
"""Extract the unqualified name from an object_reference node
219208
(e.g. 'dbo.Users' → 'Users', 'Users' → 'Users')."""

0 commit comments

Comments
 (0)