From 36c8dbc4041d2640df6ef779b811a5afbfe40181 Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 10:20:02 -0500 Subject: [PATCH 01/58] fix(mcp): don't block initialize handshake on heavy init (#172) (#177) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MCP `initialize` handler was awaiting `tryInitializeDefault` — which opens the SQLite DB and runs `await initGrammars()` (tree-sitter WASM bootstrap) — before sending the JSON-RPC response. On slow filesystems (Docker Desktop VirtioFS on macOS, WSL2) this could exceed Claude Code's ~30s handshake timeout, leaving the codegraph child process alive and unresponsive with no tools visible in the client. Send the response first; defer the open to a tracked background promise. The lazy retry path used by `tools/list` and `tools/call` now awaits that promise instead of racing it with `openSync`, so we never double-open the SQLite file. Adds a subprocess-based regression test that asserts the JSON-RPC response arrives on stdout before `startWatching()` logs to stderr. This ordering check catches the regression on any filesystem, not just slow ones where the timing matters in practice. Reported by @sashanclrp; isolated by @sgrimm's wire capture. Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 18 ++++ __tests__/mcp-initialize.test.ts | 149 +++++++++++++++++++++++++++++++ src/mcp/index.ts | 43 ++++++--- 3 files changed, 200 insertions(+), 10 deletions(-) create mode 100644 __tests__/mcp-initialize.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 904d3cb0..8b0cfce3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,24 @@ a [GitHub Release](https://github.com/colbymchenry/codegraph/releases) tagged This project follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.7.10] - 2026-05-19 + +### Fixed +- **MCP**: tools no longer silently fail to appear in clients on slow + filesystems (Docker Desktop VirtioFS on macOS, WSL2). The `initialize` + handshake was blocking on opening the SQLite database and bootstrapping + the tree-sitter WASM runtime, which on slow I/O could exceed Claude + Code's ~30s handshake timeout — leaving the codegraph process alive but + unresponsive and no tools visible. The handshake now returns immediately + and defers project open to the background; tool calls wait on the + in-flight init rather than racing it with a second open. Closes + [#172](https://github.com/colbymchenry/codegraph/issues/172). Thanks to + [@sashanclrp](https://github.com/sashanclrp) for the original report and + detailed reproduction, and [@sgrimm](https://github.com/sgrimm) for the + decisive wire capture that isolated the actual root cause. + +[0.7.10]: https://github.com/colbymchenry/codegraph/releases/tag/v0.7.10 + ## [0.7.8] - 2026-05-17 ### Fixed diff --git a/__tests__/mcp-initialize.test.ts b/__tests__/mcp-initialize.test.ts new file mode 100644 index 00000000..4a57ebae --- /dev/null +++ b/__tests__/mcp-initialize.test.ts @@ -0,0 +1,149 @@ +/** + * MCP `initialize` handshake regression tests. + * + * Issue #172: on slow filesystems (Docker Desktop VirtioFS on macOS, WSL2), + * the MCP server was blocking the initialize response on CodeGraph.open() and + * Parser.init() (web-tree-sitter WASM bootstrap), which could take longer than + * Claude Code's ~30s handshake timeout. The child process stayed alive and + * had received the request, but never sent a response, so tools never + * appeared in the client. The fix sends the initialize response before + * kicking off the heavy init in the background. These tests guard the + * contract that initialize is fast regardless of how much work init does. 
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { spawn, ChildProcessWithoutNullStreams } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { CodeGraph } from '../src'; + +const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js'); + +function spawnServer(cwd: string): ChildProcessWithoutNullStreams { + return spawn(process.execPath, [BIN, 'serve', '--mcp'], { + cwd, + stdio: ['pipe', 'pipe', 'pipe'], + }) as ChildProcessWithoutNullStreams; +} + +function sendInitialize(child: ChildProcessWithoutNullStreams, projectPath: string) { + const msg = JSON.stringify({ + jsonrpc: '2.0', + id: 0, + method: 'initialize', + params: { + protocolVersion: '2025-11-25', + capabilities: {}, + clientInfo: { name: 'test', version: '0.0.0' }, + rootUri: `file://${projectPath}`, + }, + }); + child.stdin.write(msg + '\n'); +} + +/** + * Collect stdout lines and stderr text from the child, tagging each piece + * with a monotonic sequence number. Lets us assert ordering between the + * JSON-RPC response (stdout) and side-effect logs (stderr). 
+ */ +function tagStreams(child: ChildProcessWithoutNullStreams) { + const events: Array<{ seq: number; stream: 'stdout' | 'stderr'; text: string }> = []; + let seq = 0; + let stdoutBuf = ''; + let stderrBuf = ''; + child.stdout.on('data', (chunk) => { + stdoutBuf += chunk.toString('utf8'); + let idx; + while ((idx = stdoutBuf.indexOf('\n')) !== -1) { + const line = stdoutBuf.slice(0, idx); + stdoutBuf = stdoutBuf.slice(idx + 1); + events.push({ seq: seq++, stream: 'stdout', text: line }); + } + }); + child.stderr.on('data', (chunk) => { + stderrBuf += chunk.toString('utf8'); + let idx; + while ((idx = stderrBuf.indexOf('\n')) !== -1) { + const line = stderrBuf.slice(0, idx); + stderrBuf = stderrBuf.slice(idx + 1); + events.push({ seq: seq++, stream: 'stderr', text: line }); + } + }); + return events; +} + +function waitFor( + events: ReadonlyArray<{ seq: number; stream: string; text: string }>, + predicate: (e: { seq: number; stream: string; text: string }) => boolean, + timeoutMs: number, +): Promise<{ seq: number; stream: string; text: string }> { + return new Promise((resolve, reject) => { + const started = Date.now(); + const tick = () => { + const hit = events.find(predicate); + if (hit) return resolve(hit); + if (Date.now() - started > timeoutMs) { + return reject(new Error(`Timed out waiting for predicate. 
Events: ${JSON.stringify(events)}`)); + } + setTimeout(tick, 20); + }; + tick(); + }); +} + +describe('MCP initialize handshake (issue #172)', () => { + let tempDir: string; + let child: ChildProcessWithoutNullStreams | null = null; + + beforeEach(() => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-mcp-init-')); + }); + + afterEach(() => { + if (child && !child.killed) { + child.kill('SIGKILL'); + child = null; + } + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('responds to initialize quickly when no .codegraph exists in cwd', async () => { + child = spawnServer(tempDir); + const events = tagStreams(child); + sendInitialize(child, tempDir); + const response = await waitFor(events, (e) => e.stream === 'stdout', 5000); + const json = JSON.parse(response.text); + expect(json.jsonrpc).toBe('2.0'); + expect(json.id).toBe(0); + expect(json.result.protocolVersion).toBeDefined(); + expect(json.result.capabilities.tools).toBeDefined(); + }, 10000); + + it('sends initialize response BEFORE tryInitializeDefault finishes', async () => { + // Seed a real .codegraph so the server's tryInitializeDefault path runs + // its full body: CodeGraph.open() (which awaits initGrammars()) and then + // startWatching() (which logs "File watcher active" to stderr). On any + // platform, that stderr log is observable evidence that tryInitializeDefault + // has completed. The contract we're protecting: the JSON-RPC response on + // stdout must arrive BEFORE that stderr log. If a future change re-awaits + // tryInitializeDefault before sendResult, this ordering inverts and the + // test fails — regardless of how fast the local filesystem is. 
+ const cg = await CodeGraph.init(tempDir); + cg.close(); + + child = spawnServer(tempDir); + const events = tagStreams(child); + sendInitialize(child, tempDir); + + const response = await waitFor(events, (e) => e.stream === 'stdout', 10000); + const watcherLog = await waitFor( + events, + (e) => e.stream === 'stderr' && e.text.includes('File watcher active'), + 10000, + ); + expect(response.seq).toBeLessThan(watcherLog.seq); + const json = JSON.parse(response.text); + expect(json.id).toBe(0); + expect(json.result.serverInfo.name).toBe('codegraph'); + }, 20000); +}); diff --git a/src/mcp/index.ts b/src/mcp/index.ts index e516631a..924fd77e 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -64,6 +64,9 @@ export class MCPServer { private cg: CodeGraph | null = null; private toolHandler: ToolHandler; private projectPath: string | null; + // In-flight background init kicked off from handleInitialize. Tracked so the + // sync retry path doesn't race against it (double-opening the SQLite file). + private initPromise: Promise | null = null; constructor(projectPath?: string) { this.projectPath = projectPath || null; @@ -130,8 +133,16 @@ export class MCPServer { * Called lazily on tool calls that need the default project. * Re-walks parent directories each time so it picks up projects * initialized after the MCP server started. + * + * Awaits any in-flight background init (kicked off by handleInitialize) so + * we never open the SQLite file twice concurrently. */ - private retryInitIfNeeded(): void { + private async retryInitIfNeeded(): Promise { + // Wait for the background init started during handleInitialize, if any. 
+ if (this.initPromise) { + try { await this.initPromise; } catch { /* errored init falls through to retry */ } + } + // Already initialized successfully if (this.toolHandler.hasDefaultCodeGraph()) return; // No project path to retry with @@ -266,13 +277,17 @@ export class MCPServer { projectPath = process.cwd(); } - // Try to initialize the default project (non-fatal if it fails) - await this.tryInitializeDefault(projectPath); - - // We accept the client's protocol version but respond with our supported version. - // The `instructions` field is surfaced by MCP clients in the agent's system - // prompt automatically — it's the right place for the universal tool-selection - // playbook, ahead of individual tool descriptions. + // Respond to the handshake BEFORE doing any heavy initialization. Loading + // the SQLite DB and the tree-sitter WASM runtime can take many seconds on + // slow filesystems (Docker Desktop VirtioFS on macOS, WSL2). Clients like + // Claude Code time out the handshake at ~30s, which manifested as + // "MCP tools never appear" — the child was alive and had received the + // initialize but was still awaiting initGrammars(). See issue #172. + // + // We accept the client's protocol version but respond with our supported + // version. The `instructions` field is surfaced by MCP clients in the + // agent's system prompt automatically — it's the right place for the + // universal tool-selection playbook, ahead of individual tool descriptions. this.transport.sendResult(request.id, { protocolVersion: PROTOCOL_VERSION, capabilities: { @@ -281,13 +296,21 @@ export class MCPServer { serverInfo: SERVER_INFO, instructions: SERVER_INSTRUCTIONS, }); + + // Kick off the default-project init in the background. Tool calls that + // arrive before it finishes will see the "not initialized yet" path and + // fall through to `retryInitIfNeeded`, which now waits for this promise + // rather than racing against it with a second open. 
+ this.initPromise = this.tryInitializeDefault(projectPath).finally(() => { + this.initPromise = null; + }); } /** * Handle tools/list request */ private async handleToolsList(request: JsonRpcRequest): Promise { - this.retryInitIfNeeded(); + await this.retryInitIfNeeded(); this.transport.sendResult(request.id, { tools: this.toolHandler.getTools(), }); @@ -327,7 +350,7 @@ export class MCPServer { // If the default project isn't initialized yet, retry in case it was // initialized after the MCP server started (e.g. user ran codegraph init) - this.retryInitIfNeeded(); + await this.retryInitIfNeeded(); const result = await this.toolHandler.execute(toolName, toolArgs); From e176062c56a6b686e0e013260992829d11fe4937 Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 10:45:20 -0500 Subject: [PATCH 02/58] fix(cli): ASCII glyph fallback for Windows console mojibake (#168) (#178) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shimmer progress renderer writes from a worker thread via `fs.writeSync(1, ...)` to keep the animation smooth while the main thread is busy in SQLite. That path bypasses Node's TTY-aware UTF-8->codepage conversion on Windows, so glyphs like `│`/`◆`/`—` were emitted as raw UTF-8 bytes and reinterpreted by the console's OEM codepage (CP437, CP936, ...), producing strings like `鋍?[0m 鉒?[0m Scanning files 鈥?N found`. Add `src/ui/glyphs.ts` with `supportsUnicode()` detection plus matched Unicode + ASCII glyph sets, and route all CLI/shimmer output through `getGlyphs()`. Defaults: ASCII on Windows and on Linux kernel consoles (`TERM=linux`), Unicode everywhere else. `CODEGRAPH_UNICODE=1` and `CODEGRAPH_ASCII=1` are escape hatches. 
Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 15 +++ __tests__/glyphs.test.ts | 170 ++++++++++++++++++++++++++++++++++ src/bin/codegraph.ts | 42 +++++---- src/bin/node-version-check.ts | 7 +- src/installer/index.ts | 3 +- src/ui/glyphs.ts | 91 ++++++++++++++++++ src/ui/shimmer-worker.ts | 28 +++--- 7 files changed, 322 insertions(+), 34 deletions(-) create mode 100644 __tests__/glyphs.test.ts create mode 100644 src/ui/glyphs.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b0cfce3..50cb1a5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,21 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). [@sashanclrp](https://github.com/sashanclrp) for the original report and detailed reproduction, and [@sgrimm](https://github.com/sgrimm) for the decisive wire capture that isolated the actual root cause. +- **CLI**: terminal output no longer mojibakes on Windows PowerShell / + cmd.exe during `codegraph index` and `codegraph sync`. The shimmer + progress renderer writes from a worker thread via `fs.writeSync(1, …)` + to keep the animation smooth while the main thread is busy in SQLite, + which bypasses Node's TTY-aware UTF-8→codepage conversion — so glyphs + like `│ ◆ —` were emitted as raw UTF-8 bytes and reinterpreted as the + console's OEM codepage (CP437, CP936, …), producing strings like + `鋍?[0m 鉒?[0m Scanning files 鈥?N found`. CodeGraph now picks an ASCII + glyph set on Windows by default (`| * -` instead of `│ ◆ —`); set + `CODEGRAPH_UNICODE=1` to opt back into the Unicode glyphs (e.g. on + pwsh 7 with UTF-8 codepage), or `CODEGRAPH_ASCII=1` on any platform to + force ASCII (useful for log collectors / non-TTY pipelines). Closes + [#168](https://github.com/colbymchenry/codegraph/issues/168). Thanks to + [@starkleek](https://github.com/starkleek) for the report and to + [@Bortlesboat](https://github.com/Bortlesboat) for the initial PR. 
[0.7.10]: https://github.com/colbymchenry/codegraph/releases/tag/v0.7.10 diff --git a/__tests__/glyphs.test.ts b/__tests__/glyphs.test.ts new file mode 100644 index 00000000..db41a105 --- /dev/null +++ b/__tests__/glyphs.test.ts @@ -0,0 +1,170 @@ +/** + * Glyph fallback / Unicode-support detection. + * + * Pinned because the matrix is small and the consequence of regression + * is highly visible: shimmer-worker output on Windows mojibakes when + * UTF-8 glyphs are written via `fs.writeSync` (see #168). The detection + * + ASCII fallback is the contract that prevents this. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { + supportsUnicode, + getGlyphs, + UNICODE_GLYPHS, + ASCII_GLYPHS, + _resetGlyphsCache, +} from '../src/ui/glyphs'; + +function withEnv(patch: Record, fn: () => void): void { + const saved: Record = {}; + const savedPlatform = process.platform; + for (const key of Object.keys(patch)) { + saved[key] = process.env[key]; + if (patch[key] === undefined) delete process.env[key]; + else process.env[key] = patch[key]; + } + _resetGlyphsCache(); + try { + fn(); + } finally { + for (const key of Object.keys(saved)) { + if (saved[key] === undefined) delete process.env[key]; + else process.env[key] = saved[key]; + } + Object.defineProperty(process, 'platform', { value: savedPlatform }); + _resetGlyphsCache(); + } +} + +function setPlatform(value: NodeJS.Platform): void { + Object.defineProperty(process, 'platform', { value }); +} + +describe('supportsUnicode', () => { + let originalPlatform: NodeJS.Platform; + + beforeEach(() => { + originalPlatform = process.platform; + _resetGlyphsCache(); + }); + + afterEach(() => { + Object.defineProperty(process, 'platform', { value: originalPlatform }); + _resetGlyphsCache(); + }); + + it('returns false on Windows by default (mojibake-prone consoles)', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined, TERM: undefined }, () => { + setPlatform('win32'); + 
expect(supportsUnicode()).toBe(false); + }); + }); + + it('returns true on macOS by default', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined, TERM: undefined }, () => { + setPlatform('darwin'); + expect(supportsUnicode()).toBe(true); + }); + }); + + it('returns true on Linux by default', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined, TERM: undefined }, () => { + setPlatform('linux'); + expect(supportsUnicode()).toBe(true); + }); + }); + + it('returns false on Linux kernel console (TERM=linux)', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined, TERM: 'linux' }, () => { + setPlatform('linux'); + expect(supportsUnicode()).toBe(false); + }); + }); + + it('respects CODEGRAPH_UNICODE=1 on Windows (opt-in escape hatch)', () => { + withEnv({ CODEGRAPH_UNICODE: '1', CODEGRAPH_ASCII: undefined }, () => { + setPlatform('win32'); + expect(supportsUnicode()).toBe(true); + }); + }); + + it('respects CODEGRAPH_ASCII=1 on macOS (opt-out escape hatch)', () => { + withEnv({ CODEGRAPH_ASCII: '1', CODEGRAPH_UNICODE: undefined }, () => { + setPlatform('darwin'); + expect(supportsUnicode()).toBe(false); + }); + }); + + it('CODEGRAPH_ASCII takes precedence over CODEGRAPH_UNICODE', () => { + withEnv({ CODEGRAPH_ASCII: '1', CODEGRAPH_UNICODE: '1' }, () => { + setPlatform('darwin'); + expect(supportsUnicode()).toBe(false); + }); + }); +}); + +describe('getGlyphs', () => { + let originalPlatform: NodeJS.Platform; + + beforeEach(() => { + originalPlatform = process.platform; + _resetGlyphsCache(); + }); + + afterEach(() => { + Object.defineProperty(process, 'platform', { value: originalPlatform }); + _resetGlyphsCache(); + }); + + it('returns ASCII glyphs on Windows', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined }, () => { + setPlatform('win32'); + const g = getGlyphs(); + expect(g).toBe(ASCII_GLYPHS); + expect(g.ok).toBe('[OK]'); + expect(g.rail).toBe('|'); + 
expect(g.phaseDone).toBe('*'); + expect(g.dash).toBe('-'); + }); + }); + + it('returns Unicode glyphs on macOS', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined }, () => { + setPlatform('darwin'); + const g = getGlyphs(); + expect(g).toBe(UNICODE_GLYPHS); + expect(g.ok).toBe('✓'); + expect(g.rail).toBe('│'); + expect(g.phaseDone).toBe('◆'); + expect(g.dash).toBe('—'); + }); + }); + + it('caches the result so repeated calls return the same object', () => { + withEnv({ CODEGRAPH_ASCII: undefined, CODEGRAPH_UNICODE: undefined }, () => { + setPlatform('darwin'); + expect(getGlyphs()).toBe(getGlyphs()); + }); + }); +}); + +describe('Glyph sets', () => { + it('ASCII and Unicode sets cover the same keys', () => { + expect(Object.keys(ASCII_GLYPHS).sort()).toEqual(Object.keys(UNICODE_GLYPHS).sort()); + }); + + it('ASCII glyphs are all 7-bit ASCII', () => { + for (const [key, value] of Object.entries(ASCII_GLYPHS)) { + const flat = Array.isArray(value) ? value.join('') : value; + for (let i = 0; i < flat.length; i++) { + const codepoint = flat.charCodeAt(i); + expect(codepoint, `ASCII_GLYPHS.${key} contains non-ASCII char U+${codepoint.toString(16).toUpperCase().padStart(4, '0')}`).toBeLessThan(128); + } + } + }); + + it('ASCII spinner has the same frame count as the Unicode spinner', () => { + expect(ASCII_GLYPHS.spinner.length).toBe(UNICODE_GLYPHS.spinner.length); + }); +}); diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts index f9b00bd9..2b497b98 100644 --- a/src/bin/codegraph.ts +++ b/src/bin/codegraph.ts @@ -23,6 +23,7 @@ import * as path from 'path'; import * as fs from 'fs'; import { getCodeGraphDir, isInitialized } from '../directory'; import { createShimmerProgress } from '../ui/shimmer-progress'; +import { getGlyphs } from '../ui/glyphs'; import { buildNode25BlockBanner } from './node-version-check'; @@ -32,7 +33,7 @@ async function loadCodeGraph(): Promise { return await import('../index'); } catch (err) { const msg = err 
instanceof Error ? err.message : String(err); - console.error('\x1b[31m✗\x1b[0m Failed to load CodeGraph modules.'); + console.error(`\x1b[31m${getGlyphs().err}\x1b[0m Failed to load CodeGraph modules.`); console.error(`\n Node: ${process.version} Platform: ${process.platform} ${process.arch}`); console.error(`\n Error: ${msg}`); console.error('\n Try reinstalling with: npm install -g @colbymchenry/codegraph\n'); @@ -212,7 +213,7 @@ function createVerboseProgress(): (progress: { phase: string; current: number; t // Log every 5% to keep output manageable if (pct >= lastPct + 5 || progress.current === progress.total) { lastPct = pct; - console.log(`[${elapsed}s] ${progress.current}/${progress.total} (${pct}%)${progress.currentFile ? ` — ${progress.currentFile}` : ''}`); + console.log(`[${elapsed}s] ${progress.current}/${progress.total} (${pct}%)${progress.currentFile ? ` ${getGlyphs().dash} ${progress.currentFile}` : ''}`); } } else if (progress.current > 0) { // Scanning phase (no total yet) — log periodically @@ -227,28 +228,28 @@ function createVerboseProgress(): (progress: { phase: string; current: number; t * Print success message */ function success(message: string): void { - console.log(chalk.green('✓') + ' ' + message); + console.log(chalk.green(getGlyphs().ok) + ' ' + message); } /** * Print error message */ function error(message: string): void { - console.error(chalk.red('✗') + ' ' + message); + console.error(chalk.red(getGlyphs().err) + ' ' + message); } /** * Print info message */ function info(message: string): void { - console.log(chalk.blue('ℹ') + ' ' + message); + console.log(chalk.blue(getGlyphs().info) + ' ' + message); } /** * Print warning message */ function warn(message: string): void { - console.log(chalk.yellow('⚠') + ' ' + message); + console.log(chalk.yellow(getGlyphs().warn) + ' ' + message); } type IndexResult = { @@ -281,7 +282,7 @@ function printIndexResult(clack: typeof import('@clack/prompts'), result: IndexR // continuing to the 
misleading "No files found" branch or throwing. if (!result.success && !hasErrors && result.filesIndexed === 0) { const generic = result.errors.find((e) => e.severity === 'error'); - clack.log.error(generic?.message ?? 'Indexing failed — no further details available'); + clack.log.error(generic?.message ?? `Indexing failed ${getGlyphs().dash} no further details available`); return; } @@ -293,7 +294,7 @@ function printIndexResult(clack: typeof import('@clack/prompts'), result: IndexR } clack.log.info(`${formatNumber(result.nodesCreated)} nodes, ${formatNumber(result.edgesCreated)} edges in ${formatDuration(result.durationMs)}`); } else if (hasErrors) { - clack.log.error(`Indexing failed — all ${formatNumber(result.filesErrored)} files had errors`); + clack.log.error(`Indexing failed ${getGlyphs().dash} all ${formatNumber(result.filesErrored)} files had errors`); } else { clack.log.warn('No files found to index'); } @@ -327,7 +328,7 @@ function printIndexResult(clack: typeof import('@clack/prompts'), result: IndexR } if (result.filesIndexed > 0) { - clack.log.info('The index is fully usable — only the failed files are missing.'); + clack.log.info(`The index is fully usable ${getGlyphs().dash} only the failed files are missing.`); } } else if (projectPath) { const logPath = path.join(projectPath, '.codegraph', 'errors.log'); @@ -365,7 +366,7 @@ function writeErrorLog(projectPath: string, errors: Array<{ message: string; fil } const lines: string[] = [ - `CodeGraph Error Log — ${new Date().toISOString()}`, + `CodeGraph Error Log - ${new Date().toISOString()}`, `${errorsByFile.size} files with errors`, '', ]; @@ -445,7 +446,7 @@ program verbose: true, }); } else { - process.stdout.write(`${colors.dim}│${colors.reset}\n`); + process.stdout.write(`${colors.dim}${getGlyphs().rail}${colors.reset}\n`); const progress = createShimmerProgress(); result = await cg.indexAll({ onProgress: progress.onProgress, @@ -488,7 +489,7 @@ program const rl = readline.createInterface({ 
input: process.stdin, output: process.stdout }); const answer = await new Promise((resolve) => { rl.question( - chalk.yellow('⚠ This will permanently delete all CodeGraph data. Continue? (y/N) '), + chalk.yellow(`${getGlyphs().warn} This will permanently delete all CodeGraph data. Continue? (y/N) `), resolve ); }); @@ -558,7 +559,7 @@ program verbose: true, }); } else { - process.stdout.write(`${colors.dim}│${colors.reset}\n`); + process.stdout.write(`${colors.dim}${getGlyphs().rail}${colors.reset}\n`); const progress = createShimmerProgress(); result = await cg.indexAll({ onProgress: progress.onProgress, @@ -610,7 +611,7 @@ program const clack = await importESM('@clack/prompts'); clack.intro('Syncing CodeGraph'); - process.stdout.write(`${colors.dim}│${colors.reset}\n`); + process.stdout.write(`${colors.dim}${getGlyphs().rail}${colors.reset}\n`); const progress = createShimmerProgress(); const result = await cg.sync({ @@ -629,7 +630,7 @@ program if (result.filesAdded > 0) details.push(`Added: ${result.filesAdded}`); if (result.filesModified > 0) details.push(`Modified: ${result.filesModified}`); if (result.filesRemoved > 0) details.push(`Removed: ${result.filesRemoved}`); - clack.log.info(`${details.join(', ')} — ${formatNumber(result.nodesUpdated)} nodes in ${formatDuration(result.durationMs)}`); + clack.log.info(`${details.join(', ')} ${getGlyphs().dash} ${formatNumber(result.nodesUpdated)} nodes in ${formatDuration(result.durationMs)}`); } clack.outro('Done'); @@ -711,7 +712,7 @@ program // when the native build fails. const backendLabel = backend === 'native' ? 
chalk.green('native') - : chalk.yellow('wasm — slower fallback; run `npm rebuild better-sqlite3`'); + : chalk.yellow(`wasm ${getGlyphs().dash} slower fallback; run \`npm rebuild better-sqlite3\``); console.log(` Backend: ${backendLabel}`); console.log(); @@ -1000,8 +1001,9 @@ function printFileTree( const renderNode = (node: TreeNode, prefix: string, isLast: boolean, depth: number): void => { if (maxDepth !== undefined && depth > maxDepth) return; - const connector = isLast ? '└── ' : '├── '; - const childPrefix = isLast ? ' ' : '│ '; + const glyphs = getGlyphs(); + const connector = isLast ? glyphs.treeLast : glyphs.treeBranch; + const childPrefix = isLast ? ' ' : glyphs.treePipe; if (node.name) { let line = prefix + connector + node.name; @@ -1097,7 +1099,7 @@ program // Default: show info about MCP mode. // Use stderr so stdout stays clean for any piped/stdio usage. console.error(chalk.bold('\nCodeGraph MCP Server\n')); - console.error(chalk.blue('ℹ') + ' Use --mcp flag to start the MCP server'); + console.error(chalk.blue(getGlyphs().info) + ' Use --mcp flag to start the MCP server'); console.error('\nTo use with Claude Code, add to your MCP configuration:'); console.error(chalk.dim(` { @@ -1143,7 +1145,7 @@ program const lockPath = path.join(getCodeGraphDir(projectPath), 'codegraph.lock'); if (!fs.existsSync(lockPath)) { - info('No lock file found — nothing to do'); + info(`No lock file found ${getGlyphs().dash} nothing to do`); return; } diff --git a/src/bin/node-version-check.ts b/src/bin/node-version-check.ts index 6aed1615..4d7539a5 100644 --- a/src/bin/node-version-check.ts +++ b/src/bin/node-version-check.ts @@ -13,9 +13,12 @@ * unsupported Node.js major version (currently 25+). Pinned via unit * test so the recovery commands and override instructions can't be * silently stripped by future edits. + * + * Uses ASCII glyphs to stay readable on Windows OEM-codepage consoles + * (see ../ui/glyphs.ts for the rationale). 
*/ export function buildNode25BlockBanner(nodeVersion: string): string { - const sep = '─'.repeat(72); + const sep = '-'.repeat(72); return [ sep, `[CodeGraph] Unsupported Node.js version: ${nodeVersion}`, @@ -29,7 +32,7 @@ export function buildNode25BlockBanner(nodeVersion: string): string { ' nvm install 22 && nvm use 22 # nvm', ' brew install node@22 && brew link --overwrite --force node@22 # Homebrew', '', - 'To override (NOT recommended — you will likely OOM):', + 'To override (NOT recommended - you will likely OOM):', ' CODEGRAPH_ALLOW_UNSAFE_NODE=1 codegraph ...', sep, ].join('\n'); diff --git a/src/installer/index.ts b/src/installer/index.ts index 32772971..833759da 100644 --- a/src/installer/index.ts +++ b/src/installer/index.ts @@ -21,6 +21,7 @@ import { resolveTargetFlag, } from './targets/registry'; import type { AgentTarget, Location, WriteResult } from './targets/types'; +import { getGlyphs } from '../ui/glyphs'; // Backwards-compat: keep these named exports — downstream code may // import them. The shim in `config-writer.ts` continues to re-export @@ -331,7 +332,7 @@ async function initializeLocalProject(clack: typeof import('@clack/prompts')): P // Index the project with shimmer progress (worker thread for smooth animation) const { createShimmerProgress } = await import('../ui/shimmer-progress'); - process.stdout.write(`\x1b[2m│\x1b[0m\n`); + process.stdout.write(`\x1b[2m${getGlyphs().rail}\x1b[0m\n`); const progress = createShimmerProgress(); const result = await cg.indexAll({ diff --git a/src/ui/glyphs.ts b/src/ui/glyphs.ts new file mode 100644 index 00000000..22aaeac2 --- /dev/null +++ b/src/ui/glyphs.ts @@ -0,0 +1,91 @@ +/** + * Glyph selection for CLI output. + * + * On Windows, console output is interpreted via the active output + * codepage. PowerShell 5.1 and cmd.exe default to OEM codepages + * (CP437, CP936, ...), so UTF-8 bytes written to the console render + * as mojibake (see #168). 
The shimmer worker is hit hardest because + * it uses `fs.writeSync(1, ...)` (raw bytes, no TTY-aware encoding + * conversion) to keep animation smooth while the main thread is + * blocked in SQLite. To stay readable everywhere, we fall back to + * ASCII glyphs on terminals known to mishandle UTF-8 (listed below). + * + * Detection is intentionally simple: + * - `CODEGRAPH_ASCII=1` -> ASCII (escape hatch for any terminal) + * - `CODEGRAPH_UNICODE=1` -> Unicode (opt-in on Windows) + * - Windows -> ASCII by default + * - Linux kernel console (`TERM=linux`) -> ASCII + * - Everything else -> Unicode + */ + +export function supportsUnicode(): boolean { + if (process.env.CODEGRAPH_ASCII === '1') return false; + if (process.env.CODEGRAPH_UNICODE === '1') return true; + if (process.platform === 'win32') return false; + return process.env.TERM !== 'linux'; +} + +export interface Glyphs { + ok: string; + err: string; + info: string; + warn: string; + spinner: string[]; + barFilled: string; + barEmpty: string; + rail: string; + phaseDone: string; + dash: string; + hLine: string; + treeBranch: string; + treeLast: string; + treePipe: string; +} + +export const UNICODE_GLYPHS: Glyphs = { + ok: '✓', + err: '✗', + info: 'ℹ', + warn: '⚠', + spinner: ['·', '✢', '✳', '✶', '✻', '✽'], + barFilled: '█', + barEmpty: '░', + rail: '│', + phaseDone: '◆', + dash: '—', + hLine: '─', + treeBranch: '├── ', + treeLast: '└── ', + treePipe: '│ ', +}; + +export const ASCII_GLYPHS: Glyphs = { + ok: '[OK]', + err: '[ERR]', + info: '[i]', + warn: '[!]', + spinner: ['.', '*', '+', 'x', 'o', 'O'], + barFilled: '#', + barEmpty: '-', + rail: '|', + phaseDone: '*', + dash: '-', + hLine: '-', + treeBranch: '|-- ', + treeLast: '`-- ', + treePipe: '| ', +}; + +let cached: Glyphs | null = null; + +export function getGlyphs(): Glyphs { + if (cached === null) { + cached = supportsUnicode() ? UNICODE_GLYPHS : ASCII_GLYPHS; + } + return cached; +} + +/** Reset the cached glyph set. 
Test-only; production code should call `getGlyphs()`. */ +export function _resetGlyphsCache(): void { + cached = null; +} diff --git a/src/ui/shimmer-worker.ts b/src/ui/shimmer-worker.ts index 46b91192..675408a4 100644 --- a/src/ui/shimmer-worker.ts +++ b/src/ui/shimmer-worker.ts @@ -1,5 +1,6 @@ import { parentPort, workerData } from 'worker_threads'; import { writeSync } from 'fs'; +import { getGlyphs } from './glyphs'; import type { ShimmerWorkerMessage } from './types'; // Write directly to fd 1 (stdout) instead of writeStdout(). @@ -7,11 +8,16 @@ import type { ShimmerWorkerMessage } from './types'; // thread's event loop — so if the main thread is blocked (e.g. SQLite), // stdout writes from the worker queue up and the animation freezes. // fs.writeSync(1, ...) is a direct kernel syscall that bypasses this. +// +// Side effect: bypasses Node's TTY-aware encoding conversion on Windows, +// so UTF-8 bytes hit the console raw and mojibake on OEM codepages. +// `getGlyphs()` returns ASCII fallbacks on Windows to avoid this (#168). 
function writeStdout(s: string): void { writeSync(1, s); } -const SPINNER_GLYPHS = ['·', '✢', '✳', '✶', '✻', '✽']; +const G = getGlyphs(); +const SPINNER_GLYPHS = G.spinner; const ANIM_INTERVAL = 150; const FRAMES_PER_GLYPH = 3; @@ -43,7 +49,7 @@ function formatNumber(n: number): string { } function renderBar(frame: number, filled: number, empty: number): string { - if (filled === 0) return `${DM}${'░'.repeat(empty)}${RST}`; + if (filled === 0) return `${DM}${G.barEmpty.repeat(empty)}${RST}`; const cycleFrames = 24; const shimmerPos = ((frame % cycleFrames) / cycleFrames) * (filled + 6) - 3; const shimmerWidth = 3; @@ -54,9 +60,9 @@ function renderBar(frame: number, filled: number, empty: number): string { const r = lerp(160, 251, t); const g = lerp(100, 191, t); const b = lerp(9, 36, t); - bar += `\x1b[38;2;${r};${g};${b}m${BOLD}█`; + bar += `\x1b[38;2;${r};${g};${b}m${BOLD}${G.barFilled}`; } - bar += `${RST}${DM}${'░'.repeat(empty)}${RST}`; + bar += `${RST}${DM}${G.barEmpty.repeat(empty)}${RST}`; return bar; } @@ -69,7 +75,7 @@ function render(): void { if (!currentMessage) return; const frame = animFrame(); const glyphIdx = Math.floor(frame / FRAMES_PER_GLYPH) % SPINNER_GLYPHS.length; - const glyph = SPINNER_GLYPHS[glyphIdx] ?? '·'; + const glyph = SPINNER_GLYPHS[glyphIdx] ?? SPINNER_GLYPHS[0] ?? '.'; const color = shimmerColor(frame); let line: string; @@ -77,11 +83,11 @@ function render(): void { const barWidth = 25; const filled = Math.round(barWidth * currentPercent / 100); const empty = barWidth - filled; - line = `${DM}│${RST} ${color}${glyph}${RST} ${currentMessage} ${renderBar(frame, filled, empty)} ${currentPercent}%`; + line = `${DM}${G.rail}${RST} ${color}${glyph}${RST} ${currentMessage} ${renderBar(frame, filled, empty)} ${currentPercent}%`; } else if (currentCount > 0) { - line = `${DM}│${RST} ${color}${glyph}${RST} ${currentMessage}... ${formatNumber(currentCount)} found`; + line = `${DM}${G.rail}${RST} ${color}${glyph}${RST} ${currentMessage}... 
${formatNumber(currentCount)} found`; } else { - line = `${DM}│${RST} ${color}${glyph}${RST} ${currentMessage}...`; + line = `${DM}${G.rail}${RST} ${color}${glyph}${RST} ${currentMessage}...`; } writeStdout(`\r\x1b[K${line}`); @@ -91,9 +97,9 @@ function finishPhase(): void { if (!currentMessage) return; writeStdout(`\r\x1b[K`); let detail = ''; - if (currentPercent >= 0) detail = ' — done'; - else if (currentCount > 0) detail = ` — ${formatNumber(currentCount)} found`; - writeStdout(`${DM}│${RST} ${GRN}◆${RST} ${currentMessage}${detail}\n`); + if (currentPercent >= 0) detail = ` ${G.dash} done`; + else if (currentCount > 0) detail = ` ${G.dash} ${formatNumber(currentCount)} found`; + writeStdout(`${DM}${G.rail}${RST} ${GRN}${G.phaseDone}${RST} ${currentMessage}${detail}\n`); currentMessage = ''; currentPercent = -1; currentCount = 0; From 83f36dc1704e28a474803b8e57b97356210cecf9 Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 11:02:26 -0500 Subject: [PATCH 03/58] fix(mcp): resolve module-qualified symbol lookups (#173) (#179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `codegraph_callees stage_apply::run` (and `_node`, `_impact`, ...) returned "not found" against a repo with 7-9 sibling Rust modules, each exporting `pub async fn run`. Two underlying issues: 1. The FTS5 query builder stripped `:` as a special char without splitting on `::`, so `stage_apply::run` collapsed to the literal `stage_applyrun` which matches nothing. Treat `::` as whitespace before the strip step so both halves become FTS tokens. 2. `matchesSymbol` only understood `Parent.child` qualifiers and relied on `qualifiedName` carrying the module path. Rust file- level functions don't have their module name in `qualifiedName` (it's encoded in the file path instead), so even dot-style lookups failed. 
Accept `::`, `.`, `/` as separators; multi-level forms compose; Rust `crate::`/`super::`/`self::` prefixes get stripped before path matching. Fall back to file-path containment when the qualified-name suffix doesn't match — `stage_apply::run` matches a `run` in any file whose path has a `stage_apply` segment. Also tightens the no-match branch: qualified lookups no longer fall through to a fuzzy text match. `stage_apply::nonexistent_fn` returns `null` instead of silently resolving to an unrelated `rollback` in the same file. Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 23 ++++ __tests__/symbol-lookup.test.ts | 194 ++++++++++++++++++++++++++++++++ src/db/queries.ts | 8 +- src/mcp/tools.ts | 105 ++++++++++++++--- 4 files changed, 312 insertions(+), 18 deletions(-) create mode 100644 __tests__/symbol-lookup.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 50cb1a5a..30937cd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,29 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). [#168](https://github.com/colbymchenry/codegraph/issues/168). Thanks to [@starkleek](https://github.com/starkleek) for the report and to [@Bortlesboat](https://github.com/Bortlesboat) for the initial PR. +- **MCP / search**: module-qualified symbol lookups now resolve. The + MCP tools (`codegraph_node`, `codegraph_callees`, `codegraph_impact`, + …) accept `module::symbol` (Rust / C++ / Ruby), `Module.symbol` + (TS / JS / Python), and `module/symbol` (path-style) — multi-level + forms (`crate::configurator::stage_apply::run`) and Rust path + prefixes (`crate`, `super`, `self`) are handled. Two underlying + fixes: + - The FTS5 query builder now treats `::` as a token separator + instead of stripping it to nothing, so `stage_apply::run` no + longer collapses to the unsearchable `stage_applyrun`. 
+ - `matchesSymbol` falls back to a file-path containment check when + `qualifiedName` doesn't carry the module hierarchy (Rust file- + level functions, Python free functions in a package): a `run` + in `src/configurator/stage_apply.rs` now matches + `stage_apply::run` because `stage_apply` appears as a path + segment. + - Qualified lookups that don't match the qualifier no longer fall + through to fuzzy text matches — `stage_apply::nonexistent_fn` + returns `null` instead of resolving to an unrelated `rollback` + in the same file. + Closes [#173](https://github.com/colbymchenry/codegraph/issues/173). + Thanks to [@joselhurtado](https://github.com/joselhurtado) for the + detailed reproduction. [0.7.10]: https://github.com/colbymchenry/codegraph/releases/tag/v0.7.10 diff --git a/__tests__/symbol-lookup.test.ts b/__tests__/symbol-lookup.test.ts new file mode 100644 index 00000000..d27e157b --- /dev/null +++ b/__tests__/symbol-lookup.test.ts @@ -0,0 +1,194 @@ +/** + * Module-qualified symbol lookup (`stage_apply::run`, `Session.request`, + * `configurator/stage_apply`). + * + * Pinned because the lookup vocabulary is what makes codegraph useful + * in workspaces with same-named symbols across modules — Rust + * sub-pipelines, Python `__init__.py` packages, Java packages, etc. + * See #173 for the original report: a `run` function in + * `src/configurator/stage_apply.rs` was indexed but `stage_apply::run` + * returned "not found" because (a) FTS strips colons to nothing, + * leaving a useless query, and (b) `matchesSymbol` only understood + * `.`-style qualifiers. 
+ */ + +import { describe, it, expect, beforeAll, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { initGrammars, loadAllGrammars } from '../src/extraction/grammars'; + +beforeAll(async () => { + await initGrammars(); + await loadAllGrammars(); +}); + +function hasSqliteBindings(): boolean { + try { + const Database = require('better-sqlite3'); + const db = new Database(':memory:'); + db.close(); + return true; + } catch { + return false; + } +} +const HAS_SQLITE = hasSqliteBindings(); + +function tmpRoot(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-symbol-lookup-')); +} + +function rmTree(dir: string): void { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +async function buildRustWorkspace(): Promise { + const root = tmpRoot(); + const cfgDir = path.join(root, 'src', 'configurator'); + fs.mkdirSync(cfgDir, { recursive: true }); + fs.writeFileSync( + path.join(root, 'Cargo.toml'), + `[package]\nname = "fixture"\nversion = "0.1.0"\nedition = "2021"\n[lib]\npath = "src/lib.rs"\n` + ); + fs.writeFileSync(path.join(root, 'src', 'lib.rs'), `pub mod configurator;\npub mod scheduler;\n`); + fs.writeFileSync( + path.join(cfgDir, 'mod.rs'), + `pub mod stage_apply;\npub mod stage_detect;\n` + ); + fs.writeFileSync( + path.join(cfgDir, 'stage_apply.rs'), + `pub async fn run() -> Result<(), ()> {\n render_and_write();\n Ok(())\n}\n\nfn render_and_write() {}\n` + ); + fs.writeFileSync( + path.join(cfgDir, 'stage_detect.rs'), + `pub async fn run() -> Result<(), ()> { Ok(()) }\n` + ); + fs.writeFileSync( + path.join(root, 'src', 'scheduler.rs'), + `pub fn run_due_tasks() -> Result<(), ()> { Ok(()) }\n` + ); + return root; +} + +describe.skipIf(!HAS_SQLITE)('matchesSymbol — module-qualified lookups (#173)', () => { + let projectRoot: string; + let cg: any; + let handler: any; + let findSymbol: (cg: any, s: string) => { node: any; note: 
string } | null; + let findAllSymbols: (cg: any, s: string) => { nodes: any[]; note: string }; + + beforeEach(async () => { + projectRoot = await buildRustWorkspace(); + const CodeGraph = (await import('../src/index')).default; + const { ToolHandler } = await import('../src/mcp/tools'); + cg = CodeGraph.initSync(projectRoot, { + config: { include: ['**/*.rs'], exclude: [] }, + }); + await cg.indexAll(); + handler = new ToolHandler(cg); + findSymbol = (handler as any).findSymbol.bind(handler); + findAllSymbols = (handler as any).findAllSymbols.bind(handler); + }); + + afterEach(() => { + handler?.closeAll(); + cg?.destroy(); + rmTree(projectRoot); + }); + + it('resolves `stage_apply::run` to the run in stage_apply.rs (not stage_detect.rs)', () => { + const match = findSymbol(cg, 'stage_apply::run'); + expect(match).not.toBeNull(); + expect(match!.node.name).toBe('run'); + expect(match!.node.filePath).toMatch(/configurator\/stage_apply\.rs$/); + }); + + it('rejects `stage_apply::run` for the same-named function in a different module', () => { + const all = findAllSymbols(cg, 'stage_apply::run'); + // All returned nodes must be in stage_apply.rs — never in stage_detect.rs + for (const node of all.nodes) { + expect(node.filePath).toMatch(/stage_apply\.rs$/); + } + expect(all.nodes.length).toBeGreaterThan(0); + }); + + it('resolves `configurator::stage_apply::run` (multi-level qualifier)', () => { + const match = findSymbol(cg, 'configurator::stage_apply::run'); + expect(match).not.toBeNull(); + expect(match!.node.name).toBe('run'); + expect(match!.node.filePath).toMatch(/configurator\/stage_apply\.rs$/); + }); + + it('resolves `crate::configurator::stage_apply::run` (Rust path prefix stripped)', () => { + const match = findSymbol(cg, 'crate::configurator::stage_apply::run'); + expect(match).not.toBeNull(); + expect(match!.node.filePath).toMatch(/configurator\/stage_apply\.rs$/); + }); + + it('resolves `configurator/stage_apply` (slash qualifier)', () => { + const match 
= findSymbol(cg, 'configurator/stage_apply/run'); + expect(match).not.toBeNull(); + expect(match!.node.filePath).toMatch(/configurator\/stage_apply\.rs$/); + }); + + it('does not silently collide bare `run` with `run_due_tasks`', () => { + const match = findSymbol(cg, 'run'); + expect(match).not.toBeNull(); + // Whatever it picks, it must be an exact-name match, not a partial. + expect(match!.node.name).toBe('run'); + }); + + it('aggregates all bare-name `run` matches across modules', () => { + const all = findAllSymbols(cg, 'run'); + const names = all.nodes.map((n: any) => n.name); + expect(names.every((n: string) => n === 'run')).toBe(true); + expect(all.nodes.length).toBeGreaterThanOrEqual(2); // stage_apply + stage_detect + // The note should call out the ambiguity. + expect(all.note).toMatch(/Aggregated|symbols named "run"/); + }); + + it('still returns null for genuinely unknown qualified lookups', () => { + const match = findSymbol(cg, 'stage_apply::nonexistent_fn'); + expect(match).toBeNull(); + }); +}); + +describe.skipIf(!HAS_SQLITE)('matchesSymbol — dotted lookups (regression for #173 fix)', () => { + let projectRoot: string; + let cg: any; + let handler: any; + let findSymbol: (cg: any, s: string) => { node: any; note: string } | null; + + beforeEach(async () => { + projectRoot = tmpRoot(); + const src = path.join(projectRoot, 'src'); + fs.mkdirSync(src, { recursive: true }); + fs.writeFileSync( + path.join(src, 'session.ts'), + `export class Session {\n request(): void {}\n}\nexport function request(): void {}\n` + ); + + const CodeGraph = (await import('../src/index')).default; + const { ToolHandler } = await import('../src/mcp/tools'); + cg = CodeGraph.initSync(projectRoot, { + config: { include: ['src/**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + handler = new ToolHandler(cg); + findSymbol = (handler as any).findSymbol.bind(handler); + }); + + afterEach(() => { + handler?.closeAll(); + cg?.destroy(); + rmTree(projectRoot); + }); + + 
it('`Session.request` resolves to the method, not the bare function', () => { + const match = findSymbol(cg, 'Session.request'); + expect(match).not.toBeNull(); + expect(match!.node.kind).toBe('method'); + expect(match!.node.qualifiedName).toContain('Session::request'); + }); +}); diff --git a/src/db/queries.ts b/src/db/queries.ts index db7c6118..ebba66e6 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -696,8 +696,14 @@ export class QueryBuilder { const { kinds, languages, limit = 100, offset = 0 } = options; // Add prefix wildcard for better matching (e.g., "auth" matches "AuthService", "authenticate") - // Escape special FTS5 characters and add prefix wildcard + // Escape special FTS5 characters and add prefix wildcard. + // + // `::` is a qualifier separator in Rust/C++/Ruby, not a token char, + // so treat it as whitespace before the strip step. Otherwise queries + // like `stage_apply::run` collapse to `stage_applyrun` (the colons + // are stripped without splitting) and find nothing. See #173. const ftsQuery = query + .replace(/::/g, ' ') // Rust/C++/Ruby qualifier separator .replace(/['"*():^]/g, '') // Remove FTS5 special chars .split(/\s+/) .filter(term => term.length > 0) diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index e796cfc7..9e9ef9d3 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -16,6 +16,21 @@ import { WASM_FALLBACK_FIX_RECIPE } from '../db'; /** Maximum output length to prevent context bloat (characters) */ const MAX_OUTPUT_LENGTH = 15000; +/** + * Rust path roots that have no file-system equivalent — `crate` is the + * current crate, `super` is the parent module, `self` is the current + * module. Used by `matchesSymbol` to strip these before file-path + * matching so `crate::configurator::stage_apply::run` resolves the + * same as `configurator::stage_apply::run`. + */ +const RUST_PATH_PREFIXES = new Set(['crate', 'super', 'self']); + +/** Last `::` / `.` / `/`-separated segment of a qualified symbol. 
*/ +function lastQualifierPart(symbol: string): string { + const parts = symbol.split(/::|[./]/).filter((p) => p.length > 0); + return parts[parts.length - 1] ?? symbol; +} + /** * Calculate the recommended number of codegraph_explore calls based on project size. * Larger codebases need more exploration calls to cover their surface area, @@ -1204,9 +1219,22 @@ export class ToolHandler { * Returns the best match and a note about alternatives if any. */ /** - * Check if a node matches a symbol query, supporting both simple names and - * qualified "Parent.child" notation (e.g., "Session.request" matches a method - * named "request" inside a class named "Session"). + * Check if a node matches a symbol query. + * + * Accepts simple names (`run`) and three flavors of qualifier: + * - dotted `Session.request` (TS/JS/Python) + * - colon-pair `stage_apply::run` (Rust, C++, Ruby) + * - slash `configurator/stage_apply` (path-ish) + * + * Multi-level qualifiers compose: `crate::configurator::stage_apply::run` + * works. Rust path prefixes (`crate`, `super`, `self`) are stripped so + * the canonical `crate::module::symbol` form resolves. + * + * Resolution order, last part must always equal `node.name`: + * 1. Suffix-match against `qualifiedName` (handles class-scoped methods + * where the extractor builds the qualified name from the AST stack) + * 2. 
File-path containment (handles file-derived modules in Rust/ + * Python — `stage_apply::run` matches a `run` in `stage_apply.rs`) */ private matchesSymbol(node: Node, symbol: string): boolean { // Simple name match @@ -1214,21 +1242,52 @@ export class ToolHandler { // File basename match (e.g., "product-card" matches "product-card.liquid") if (node.kind === 'file' && node.name.replace(/\.[^.]+$/, '') === symbol) return true; - // Qualified name match: "Parent.child" → look for "::Parent::child" in qualified_name - if (symbol.includes('.')) { - const parts = symbol.split('.'); - const qualifiedSuffix = parts.join('::'); - if (node.qualifiedName.includes(qualifiedSuffix)) return true; - } - - return false; + // Qualified-name lookups: split on any supported separator. `\w` keeps + // identifier chars (incl. `_`) intact; everything else is treated as + // a separator we tolerate. + if (!/[.\/]|::/.test(symbol)) return false; + const parts = symbol.split(/::|[./]/).filter((p) => p.length > 0); + if (parts.length < 2) return false; + + const lastPart = parts[parts.length - 1]!; + if (node.name !== lastPart) return false; + + // Stage 1: qualified-name suffix match. The extractor joins the + // semantic hierarchy with `::`, so `Session.request` and + // `Session::request` both become `Session::request` here. + const colonSuffix = parts.join('::'); + if (node.qualifiedName.includes(colonSuffix)) return true; + + // Stage 2: file-path containment. Rust modules and Python packages + // are not in `qualifiedName` — they're encoded in the file path. So + // `stage_apply::run` matches a `run` in any file whose path + // contains a `stage_apply` segment (with or without an extension). + // + // Filter out Rust path prefixes that have no file-system equivalent. 
+ const containerHints = parts.slice(0, -1).filter((p) => !RUST_PATH_PREFIXES.has(p)); + if (containerHints.length === 0) return false; + + const segments = node.filePath.split('/').filter((s) => s.length > 0); + return containerHints.every((hint) => + segments.some((seg) => seg === hint || seg.replace(/\.[^.]+$/, '') === hint) + ); } private findSymbol(cg: CodeGraph, symbol: string): { node: Node; note: string } | null { - // Use higher limit for qualified lookups (e.g., "Session.request") since the - // target may rank lower in FTS when there are many partial matches - const limit = symbol.includes('.') ? 50 : 10; - const results = cg.searchNodes(symbol, { limit }); + // Use higher limit for qualified lookups (e.g., "Session.request", + // "stage_apply::run") since the target may rank lower in FTS when + // there are many partial matches across the qualifier parts. + const isQualified = /[.\/]|::/.test(symbol); + const limit = isQualified ? 50 : 10; + let results = cg.searchNodes(symbol, { limit }); + + // FTS strips colons as a special char, so `stage_apply::run` searches + // for the literal `stage_applyrun` and finds nothing. Re-search by + // the bare last part and let `matchesSymbol` filter by qualifier. + if (isQualified && results.length === 0) { + const tail = lastQualifierPart(symbol); + if (tail && tail !== symbol) results = cg.searchNodes(tail, { limit }); + } if (results.length === 0 || !results[0]) { return null; @@ -1250,7 +1309,11 @@ export class ToolHandler { return { node: picked, note }; } - // No exact match, use best fuzzy match + // No exact match. For qualified lookups, don't silently fall back + // to a fuzzy result — the user typed a specific qualifier, and + // resolving `stage_apply::nonexistent_fn` to the unrelated + // `stage_apply.rs` file would be actively misleading (#173). 
+ if (isQualified) return null; return { node: results[0]!.node, note: '' }; } @@ -1259,7 +1322,15 @@ export class ToolHandler { * results across all matching symbols (e.g., multiple classes with an `execute` method). */ private findAllSymbols(cg: CodeGraph, symbol: string): { nodes: Node[]; note: string } { - const results = cg.searchNodes(symbol, { limit: 50 }); + let results = cg.searchNodes(symbol, { limit: 50 }); + + // Mirror the fallback in `findSymbol` for qualified queries — FTS + // strips colons, so a module-qualified lookup needs a second pass + // by the bare last part. + if (results.length === 0 && /[.\/]|::/.test(symbol)) { + const tail = lastQualifierPart(symbol); + if (tail && tail !== symbol) results = cg.searchNodes(tail, { limit: 50 }); + } if (results.length === 0) { return { nodes: [], note: '' }; From fb8fb0ea8bdbe0cb08276588facdec777ecc2e3b Mon Sep 17 00:00:00 2001 From: Colby McHenry Date: Tue, 19 May 2026 11:32:47 -0500 Subject: [PATCH 04/58] release: 0.7.10 (Windows mojibake fix, module-qualified symbol lookups, MCP handshake) --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 028c5dc8..dfcebafa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@colbymchenry/codegraph", - "version": "0.7.9", + "version": "0.7.10", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@colbymchenry/codegraph", - "version": "0.7.9", + "version": "0.7.10", "license": "MIT", "dependencies": { "@clack/prompts": "^1.3.0", diff --git a/package.json b/package.json index 3ea0b8cf..2731804b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@colbymchenry/codegraph", - "version": "0.7.9", + "version": "0.7.10", "description": "Supercharge Claude Code with semantic code intelligence. 
94% fewer tool calls • 77% faster exploration • 100% local.", "main": "dist/index.js", "types": "dist/index.d.ts", From 483ec9171c5600d44bd7f0f1e2ad977460903bb3 Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 11:42:22 -0500 Subject: [PATCH 05/58] chore(release): unwrap CHANGELOG paragraphs for GitHub Release notes (#180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub renders release-note Markdown with GFM hard breaks, so every `\n` becomes `<br>
`. The CHANGELOG is hard-wrapped at ~75 chars for readable diffs, which renders as awkward visible line breaks on the release page (see https://github.com/colbymchenry/codegraph/releases/tag/v0.7.10). Add `scripts/extract-release-notes.mjs` to extract a version block and join indented continuation lines into a single line per bullet. Nested list items, headings, and link references are preserved. `scripts/release.sh` now uses this helper instead of the inline awk extractor — repo-level CHANGELOG.md viewing is unaffected because CommonMark there treats newlines as spaces. Also fix the 0.7.10 entry: "Two underlying fixes" -> "Three", "Rust file-/level" broken hyphen, and move the closes/credit line above the nested list so it doesn't strand as a top-level paragraph. Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 15 ++-- scripts/extract-release-notes.mjs | 116 ++++++++++++++++++++++++++++++ scripts/release.sh | 12 ++-- 3 files changed, 128 insertions(+), 15 deletions(-) create mode 100755 scripts/extract-release-notes.mjs diff --git a/CHANGELOG.md b/CHANGELOG.md index 30937cd6..28f07d56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,24 +42,23 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). …) accept `module::symbol` (Rust / C++ / Ruby), `Module.symbol` (TS / JS / Python), and `module/symbol` (path-style) — multi-level forms (`crate::configurator::stage_apply::run`) and Rust path - prefixes (`crate`, `super`, `self`) are handled. Two underlying - fixes: + prefixes (`crate`, `super`, `self`) are handled. Closes + [#173](https://github.com/colbymchenry/codegraph/issues/173). Thanks + to [@joselhurtado](https://github.com/joselhurtado) for the detailed + reproduction. Three underlying fixes: - The FTS5 query builder now treats `::` as a token separator instead of stripping it to nothing, so `stage_apply::run` no longer collapses to the unsearchable `stage_applyrun`. 
- `matchesSymbol` falls back to a file-path containment check when - `qualifiedName` doesn't carry the module hierarchy (Rust file- - level functions, Python free functions in a package): a `run` - in `src/configurator/stage_apply.rs` now matches + `qualifiedName` doesn't carry the module hierarchy (Rust + file-level functions, Python free functions in a package): a + `run` in `src/configurator/stage_apply.rs` now matches `stage_apply::run` because `stage_apply` appears as a path segment. - Qualified lookups that don't match the qualifier no longer fall through to fuzzy text matches — `stage_apply::nonexistent_fn` returns `null` instead of resolving to an unrelated `rollback` in the same file. - Closes [#173](https://github.com/colbymchenry/codegraph/issues/173). - Thanks to [@joselhurtado](https://github.com/joselhurtado) for the - detailed reproduction. [0.7.10]: https://github.com/colbymchenry/codegraph/releases/tag/v0.7.10 diff --git a/scripts/extract-release-notes.mjs b/scripts/extract-release-notes.mjs new file mode 100755 index 00000000..3bcf7f3f --- /dev/null +++ b/scripts/extract-release-notes.mjs @@ -0,0 +1,116 @@ +#!/usr/bin/env node +/** + * Extract a release-notes block from CHANGELOG.md for a given version, + * then unwrap hard-wrapped paragraphs. + * + * Why: GitHub renders release-note Markdown with GFM hard breaks, so + * every `\n` becomes `
`. The CHANGELOG is hard-wrapped at ~75 + * chars for readable diffs, which then renders as awkward visible + * line breaks on the release page. This script joins indented + * continuation lines into a single line per bullet so the GFM + * renderer produces clean paragraphs. + * + * Repo-level CHANGELOG.md viewing is unaffected (CommonMark treats + * newlines as spaces there). + * + * Usage: extract-release-notes.mjs + * e.g. extract-release-notes.mjs 0.7.10 + */ + +import { readFileSync } from 'fs'; + +const version = process.argv[2]; +if (!version) { + console.error('usage: extract-release-notes.mjs '); + process.exit(1); +} + +const escaped = version.replace(/\./g, '\\.'); +const headerRe = new RegExp(`^## \\[${escaped}\\]`); +const anyHeaderRe = /^## \[/; + +const lines = readFileSync('CHANGELOG.md', 'utf8').split('\n'); +const start = lines.findIndex((l) => headerRe.test(l)); +if (start === -1) { + console.error(`no '## [${version}]' entry found in CHANGELOG.md`); + process.exit(1); +} +const after = lines.findIndex((l, i) => i > start && anyHeaderRe.test(l)); +const block = lines.slice(start, after === -1 ? lines.length : after); + +// Find the indent of the most recent list item; a continuation line +// whose indent is GREATER than that belongs to that item, otherwise +// it might belong to an ancestor item further up the stack. +// +// Track a stack of `{ indent: number }` frames so we can attach a +// continuation to the right ancestor. This correctly handles the +// post-nested-list continuation pattern: +// +// - top-level +// - nested +// back to top-level <- 2-space indent, joins the top-level bullet +const out = []; +let buf = ''; // pending list-item text being built +let stack = []; // [{ indent: number }] open list items + +function flushBuf() { + if (buf !== '') { + out.push(buf); + buf = ''; + } +} + +function leadingSpaces(s) { + const m = s.match(/^(\s*)/); + return m ? 
m[1].length : 0; +} + +const listItemRe = /^(\s*)([-*+]|\d+\.)\s+/; + +for (const line of block) { + if (/^\s*$/.test(line)) { + flushBuf(); + out.push(''); + continue; + } + if (/^#/.test(line)) { + flushBuf(); + stack = []; + out.push(line); + continue; + } + const itemMatch = line.match(listItemRe); + if (itemMatch) { + flushBuf(); + const indent = itemMatch[1].length; + while (stack.length > 0 && stack[stack.length - 1].indent >= indent) { + stack.pop(); + } + stack.push({ indent }); + buf = line; + continue; + } + if (/^\s/.test(line)) { + // Continuation. Pop any list frames deeper than this indent — the + // continuation belongs to the nearest enclosing list item. + const indent = leadingSpaces(line); + while (stack.length > 1 && stack[stack.length - 1].indent >= indent) { + // Closes the deeper item — its buffered text is already in `buf` + // belonging to the most recent flush. We need to flush before + // re-buffering for the ancestor item. + flushBuf(); + stack.pop(); + } + const trimmed = line.replace(/^\s+/, ''); + buf = buf === '' ? trimmed : `${buf} ${trimmed}`; + continue; + } + // Top-level non-list, non-heading (e.g. `[0.7.10]: https://...`) + flushBuf(); + stack = []; + out.push(line); +} +flushBuf(); + +process.stdout.write(out.join('\n')); +if (!out[out.length - 1]?.endsWith('\n')) process.stdout.write('\n'); diff --git a/scripts/release.sh b/scripts/release.sh index da6bdae5..9edf8461 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -30,13 +30,11 @@ if ! grep -q "^## \[${VERSION}\]" CHANGELOG.md; then exit 1 fi -NOTES=$(awk -v v="${VERSION}" ' - /^## \[/ { - if (p) exit - if ($0 ~ "^## \\[" v "\\]") p = 1 - } - p -' CHANGELOG.md) +# Extract notes with paragraph unwrapping — GitHub Releases render with +# GFM hard-breaks, so the CHANGELOG's hard-wrapped lines would show as +# visible `
` breaks otherwise. The helper joins continuation lines +# into a single line per bullet. +NOTES=$(node scripts/extract-release-notes.mjs "${VERSION}") if [ -z "${NOTES}" ]; then echo "error: failed to extract changelog notes for ${VERSION}" >&2 From 4bb95639cafac2aef755776e48b89b1e19aba3a3 Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 12:05:59 -0500 Subject: [PATCH 06/58] chore(release): refine release-notes extractor (#181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes prompted by retroactively unwrapping the 0.7.6 / 0.7.7 / 0.7.9 release notes: - Add `--stdin` mode so the extractor can clean up an existing release body (via `gh release view ... --json body --jq '.body'`) without needing a matching CHANGELOG.md entry. The 0.7.9 release didn't have one — its body had been hand-rolled from the 0.7.8 entry on publish. - Stop treating `+` as a bullet marker. CommonMark allows it, but our CHANGELOG uses literal `+` inline (`MCP config + instructions`) and the script was misreading those as nested bullets. Keep `-`, `*`, and `N.` only. - Preserve fenced code blocks verbatim. The 0.7.6 entry has a triple- backtick ```bash block; the previous pass was joining its lines into one, producing unreadable code. Co-authored-by: Claude Opus 4.7 (1M context) --- scripts/extract-release-notes.mjs | 82 ++++++++++++++++++------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/scripts/extract-release-notes.mjs b/scripts/extract-release-notes.mjs index 3bcf7f3f..b909bcd2 100755 --- a/scripts/extract-release-notes.mjs +++ b/scripts/extract-release-notes.mjs @@ -1,7 +1,7 @@ #!/usr/bin/env node /** - * Extract a release-notes block from CHANGELOG.md for a given version, - * then unwrap hard-wrapped paragraphs. + * Extract a release-notes block from CHANGELOG.md for a given version + * (or unwrap text supplied on stdin), then join hard-wrapped paragraphs. 
* * Why: GitHub renders release-note Markdown with GFM hard breaks, so * every `\n` becomes `<br>
`. The CHANGELOG is hard-wrapped at ~75 @@ -13,45 +13,47 @@ * Repo-level CHANGELOG.md viewing is unaffected (CommonMark treats * newlines as spaces there). * - * Usage: extract-release-notes.mjs - * e.g. extract-release-notes.mjs 0.7.10 + * Usage: + * extract-release-notes.mjs # read CHANGELOG.md + * extract-release-notes.mjs --stdin # read from stdin (any text) */ import { readFileSync } from 'fs'; -const version = process.argv[2]; -if (!version) { - console.error('usage: extract-release-notes.mjs '); +const arg = process.argv[2]; +if (!arg) { + console.error('usage: extract-release-notes.mjs | --stdin'); process.exit(1); } -const escaped = version.replace(/\./g, '\\.'); -const headerRe = new RegExp(`^## \\[${escaped}\\]`); -const anyHeaderRe = /^## \[/; - -const lines = readFileSync('CHANGELOG.md', 'utf8').split('\n'); -const start = lines.findIndex((l) => headerRe.test(l)); -if (start === -1) { - console.error(`no '## [${version}]' entry found in CHANGELOG.md`); - process.exit(1); +let block; +if (arg === '--stdin') { + block = readFileSync(0, 'utf8').replace(/\r\n?/g, '\n').split('\n'); +} else { + const version = arg; + const escaped = version.replace(/\./g, '\\.'); + const headerRe = new RegExp(`^## \\[${escaped}\\]`); + const anyHeaderRe = /^## \[/; + const lines = readFileSync('CHANGELOG.md', 'utf8').split('\n'); + const start = lines.findIndex((l) => headerRe.test(l)); + if (start === -1) { + console.error(`no '## [${version}]' entry found in CHANGELOG.md`); + process.exit(1); + } + const after = lines.findIndex((l, i) => i > start && anyHeaderRe.test(l)); + block = lines.slice(start, after === -1 ? lines.length : after); } -const after = lines.findIndex((l, i) => i > start && anyHeaderRe.test(l)); -const block = lines.slice(start, after === -1 ? 
lines.length : after); -// Find the indent of the most recent list item; a continuation line -// whose indent is GREATER than that belongs to that item, otherwise -// it might belong to an ancestor item further up the stack. -// -// Track a stack of `{ indent: number }` frames so we can attach a -// continuation to the right ancestor. This correctly handles the -// post-nested-list continuation pattern: +// Track a stack of `{ indent: number }` frames so a continuation line +// can attach to the right ancestor. Handles the post-nested-list +// continuation pattern: // // - top-level // - nested // back to top-level <- 2-space indent, joins the top-level bullet const out = []; -let buf = ''; // pending list-item text being built -let stack = []; // [{ indent: number }] open list items +let buf = ''; +let stack = []; function flushBuf() { if (buf !== '') { @@ -65,9 +67,27 @@ function leadingSpaces(s) { return m ? m[1].length : 0; } -const listItemRe = /^(\s*)([-*+]|\d+\.)\s+/; +// Bullets: `-`, `*`, `digit.` only. `+` is intentionally excluded — the +// CHANGELOG uses literal `+` inline (`config + instructions`) and we +// don't want to misread those as nested bullets. +const listItemRe = /^(\s*)([-*]|\d+\.)\s+/; +const fenceRe = /^\s*```/; + +let inFence = false; for (const line of block) { + // Fenced code blocks: pass through verbatim, no joining. + if (fenceRe.test(line)) { + flushBuf(); + stack = []; + out.push(line); + inFence = !inFence; + continue; + } + if (inFence) { + out.push(line); + continue; + } if (/^\s*$/.test(line)) { flushBuf(); out.push(''); @@ -91,13 +111,8 @@ for (const line of block) { continue; } if (/^\s/.test(line)) { - // Continuation. Pop any list frames deeper than this indent — the - // continuation belongs to the nearest enclosing list item. 
const indent = leadingSpaces(line); while (stack.length > 1 && stack[stack.length - 1].indent >= indent) { - // Closes the deeper item — its buffered text is already in `buf` - // belonging to the most recent flush. We need to flush before - // re-buffering for the ancestor item. flushBuf(); stack.pop(); } @@ -105,7 +120,6 @@ for (const line of block) { buf = buf === '' ? trimmed : `${buf} ${trimmed}`; continue; } - // Top-level non-list, non-heading (e.g. `[0.7.10]: https://...`) flushBuf(); stack = []; out.push(line); From 93e53e7c69b427386e8bdb3f099d442739d7049c Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 16:23:09 -0500 Subject: [PATCH 07/58] feat(mcp): size-adaptive output budget for codegraph_explore (#185) (#187) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Output is now scaled to indexed file count. Small projects (<500 files) cap at ~18KB and skip the "Additional relevant files" / completeness / explore-budget reminders that earn their keep on larger codebases; medium (<5,000) caps at ~28KB; large (<15,000) keeps the historical ~35KB; very large goes up to ~38KB. A per-file char cap also prevents a single file with many adjacent symbols from collapsing into one whole-file dump (the pathological Alamofire `Session.swift` case reported in #185), and a per-file symbol- list cap stops the `#### path — sym(kind), ...` header from leaking multi-KB lists when many adjacent symbols cluster together. Measured against the README's benchmark repos: Alamofire (~100 files) ~62% smaller per call, Excalidraw (~600 files) ~35%, VS Code (~10k files) ~14%. Agent-trust floor preserved — Relationships, scored cluster selection, and structured-source output are all retained. 
Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 22 ++ __tests__/explore-output-budget.test.ts | 191 +++++++++++++ src/mcp/tools.ts | 348 +++++++++++++++++++----- 3 files changed, 497 insertions(+), 64 deletions(-) create mode 100644 __tests__/explore-output-budget.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 28f07d56..828421d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,28 @@ a [GitHub Release](https://github.com/colbymchenry/codegraph/releases) tagged This project follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Changed +- **MCP / explore**: `codegraph_explore` output is now adaptive to project + size. The tool used to apply a fixed 35KB cap regardless of how large the + codebase was, which on small projects (~100 files) produced bigger + responses than the agent's native grep+Read flow would have — exactly the + scenario reported in + [#185](https://github.com/colbymchenry/codegraph/issues/185). The budget + now scales with indexed file count: small projects (<500 files) cap at + ~18KB and skip the "Additional relevant files" / completeness / explore- + budget reminders that earn their keep on bigger codebases; medium + (<5,000) caps at ~28KB; large (<15,000) keeps the historical ~35KB; very + large goes up to ~38KB. A new per-file char cap also prevents a single + file with many adjacent symbols from collapsing into one whole-file dump + (the Alamofire `Session.swift` case from #185). Measured against the + same repos used in the README benchmark: Alamofire ~62% smaller per call, + Excalidraw ~35%, VS Code ~14%. Agent-trust floor still holds — the + Relationships section, scored cluster selection, and structured-source + output are all retained. Thanks to + [@essopsp](https://github.com/essopsp) for the repro. 
+ ## [0.7.10] - 2026-05-19 ### Fixed diff --git a/__tests__/explore-output-budget.test.ts b/__tests__/explore-output-budget.test.ts new file mode 100644 index 00000000..36717f82 --- /dev/null +++ b/__tests__/explore-output-budget.test.ts @@ -0,0 +1,191 @@ +/** + * Adaptive output budget for codegraph_explore (#185). + * + * The explore tool used to apply a fixed 35KB output cap regardless of + * project size, which on small codebases was a net loss vs. native + * grep+Read. These tests pin the per-tier budget shape so future tuning + * doesn't silently drift the small-project case back into bloat. + */ +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { getExploreOutputBudget, getExploreBudget, ToolHandler } from '../src/mcp/tools'; +import CodeGraph from '../src/index'; + +describe('getExploreOutputBudget', () => { + it('returns a strictly smaller total cap for small projects than for huge ones', () => { + const small = getExploreOutputBudget(100); + const huge = getExploreOutputBudget(30000); + expect(small.maxOutputChars).toBeLessThan(huge.maxOutputChars); + expect(small.defaultMaxFiles).toBeLessThan(huge.defaultMaxFiles); + expect(small.maxCharsPerFile).toBeLessThan(huge.maxCharsPerFile); + }); + + it('caps total output well under 8000 tokens (~32k chars) on small projects', () => { + const small = getExploreOutputBudget(100); + expect(small.maxOutputChars).toBeLessThanOrEqual(20000); + }); + + it('keeps the historical 35k+ ceiling for medium-large projects so existing benchmarks do not regress', () => { + const large = getExploreOutputBudget(10000); + expect(large.maxOutputChars).toBeGreaterThanOrEqual(35000); + }); + + it('uses tier breakpoints matching getExploreBudget so call-count and output-budget agree on a project', () => { + // Anything in the same tier should pick the same total-output cap. 
+ const tier1a = getExploreOutputBudget(50); + const tier1b = getExploreOutputBudget(499); + expect(tier1a.maxOutputChars).toBe(tier1b.maxOutputChars); + expect(getExploreBudget(50)).toBe(getExploreBudget(499)); + + const tier2a = getExploreOutputBudget(500); + const tier2b = getExploreOutputBudget(4999); + expect(tier2a.maxOutputChars).toBe(tier2b.maxOutputChars); + expect(getExploreBudget(500)).toBe(getExploreBudget(4999)); + + const tier3a = getExploreOutputBudget(5000); + const tier3b = getExploreOutputBudget(14999); + expect(tier3a.maxOutputChars).toBe(tier3b.maxOutputChars); + + // And crossing a breakpoint changes the cap. + expect(tier1a.maxOutputChars).not.toBe(tier2a.maxOutputChars); + expect(tier2a.maxOutputChars).not.toBe(tier3a.maxOutputChars); + }); + + it('gates off "Additional relevant files", completeness signal, and budget note on small projects', () => { + const small = getExploreOutputBudget(100); + expect(small.includeAdditionalFiles).toBe(false); + expect(small.includeCompletenessSignal).toBe(false); + expect(small.includeBudgetNote).toBe(false); + }); + + it('keeps all meta-text on for projects that earn the breadth signal (>=500 files)', () => { + const medium = getExploreOutputBudget(1000); + expect(medium.includeAdditionalFiles).toBe(true); + expect(medium.includeCompletenessSignal).toBe(true); + expect(medium.includeBudgetNote).toBe(true); + }); + + it('keeps the Relationships section on for every tier — it is the cheapest structural signal', () => { + expect(getExploreOutputBudget(50).includeRelationships).toBe(true); + expect(getExploreOutputBudget(1000).includeRelationships).toBe(true); + expect(getExploreOutputBudget(10000).includeRelationships).toBe(true); + expect(getExploreOutputBudget(30000).includeRelationships).toBe(true); + }); + + it('caps the per-file header symbol list more tightly on small projects', () => { + // Without this cap, a file like Alamofire's Session.swift produced + // a 3.4KB symbol list in the `#### path — 
sym, sym, ...` header, + // dwarfing the per-file body cap. + const small = getExploreOutputBudget(100); + const huge = getExploreOutputBudget(30000); + expect(small.maxSymbolsInFileHeader).toBeLessThan(huge.maxSymbolsInFileHeader); + expect(small.maxSymbolsInFileHeader).toBeGreaterThan(0); + }); + + it('uses a tighter clustering gap threshold on small projects to break runaway single clusters', () => { + const small = getExploreOutputBudget(100); + const huge = getExploreOutputBudget(30000); + expect(small.gapThreshold).toBeLessThanOrEqual(huge.gapThreshold); + }); + + it('handles the boundary file counts exactly (off-by-one regression guard)', () => { + // 499 -> small tier, 500 -> medium tier + expect(getExploreOutputBudget(499).maxOutputChars).toBe(getExploreOutputBudget(100).maxOutputChars); + expect(getExploreOutputBudget(500).maxOutputChars).toBe(getExploreOutputBudget(1000).maxOutputChars); + // 4999 -> medium, 5000 -> large + expect(getExploreOutputBudget(4999).maxOutputChars).toBe(getExploreOutputBudget(1000).maxOutputChars); + expect(getExploreOutputBudget(5000).maxOutputChars).toBe(getExploreOutputBudget(10000).maxOutputChars); + // 14999 -> large, 15000 -> xlarge + expect(getExploreOutputBudget(14999).maxOutputChars).toBe(getExploreOutputBudget(10000).maxOutputChars); + expect(getExploreOutputBudget(15000).maxOutputChars).toBe(getExploreOutputBudget(30000).maxOutputChars); + }); +}); + +/** + * End-to-end check that the budget is actually applied by handleExplore. + * + * Builds a tiny synthetic project (<500 files, so the small tier), indexes + * it, and confirms the output: + * - stays under the small-tier maxOutputChars cap + * - omits the meta-text the small tier gates off (completeness signal, + * budget note, "Additional relevant files") + * + * Regression guard for #185 — protects against future edits to handleExplore + * silently re-introducing the fixed 35KB cap on small projects. 
+ */ +describe('codegraph_explore output respects the adaptive budget', () => { + let testDir: string; + let cg: CodeGraph; + let handler: ToolHandler; + + beforeAll(async () => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-explore-budget-')); + const srcDir = path.join(testDir, 'src'); + fs.mkdirSync(srcDir); + + // A handful of files with one fat target file. The fat file mimics the + // Alamofire Session.swift case: many methods stacked on top of each other, + // which collapsed into one giant cluster pre-#185. + const fatLines: string[] = ['export class Session {']; + for (let i = 0; i < 30; i++) { + fatLines.push(` method${i}(arg: string): string {`); + fatLines.push(` return this.helper${i}(arg) + "${i}";`); + fatLines.push(` }`); + fatLines.push(` private helper${i}(arg: string): string {`); + fatLines.push(` return arg.repeat(${i + 1});`); + fatLines.push(` }`); + } + fatLines.push('}'); + fs.writeFileSync(path.join(srcDir, 'session.ts'), fatLines.join('\n')); + + // A few small supporting files so the project has >1 indexed file. + for (let i = 0; i < 5; i++) { + fs.writeFileSync( + path.join(srcDir, `support${i}.ts`), + `import { Session } from './session';\nexport function callSession${i}(s: Session) { return s.method${i}('hi'); }\n` + ); + } + + cg = CodeGraph.initSync(testDir, { + config: { include: ['**/*.ts'], exclude: [] }, + }); + await cg.indexAll(); + handler = new ToolHandler(cg); + }); + + afterAll(() => { + if (cg) cg.destroy(); + if (testDir && fs.existsSync(testDir)) { + fs.rmSync(testDir, { recursive: true, force: true }); + } + }); + + it('keeps total output under the small-project cap', async () => { + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? 
''; + const smallBudget = getExploreOutputBudget(100); + // Allow a small overshoot for the trailing markers — the cap is enforced + // per-file rather than as an absolute output ceiling. + expect(text.length).toBeLessThan(smallBudget.maxOutputChars + 500); + }); + + it('omits the meta-text gated off for small projects', async () => { + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? ''; + expect(text).not.toContain('### Additional relevant files'); + expect(text).not.toContain('Complete source code is included above'); + expect(text).not.toContain('Explore budget:'); + }); + + it('still includes the Relationships section — it is the cheapest structural signal', async () => { + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? ''; + // Either there are relationships, or no edges were significant — both are fine. + // We just want to confirm we did not accidentally gate it off. + const hasRelationships = text.includes('### Relationships'); + const sourceFollowsHeader = text.indexOf('### Source Code') > 0; + expect(hasRelationships || sourceFollowsHeader).toBe(true); + }); +}); diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 9e9ef9d3..21767906 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -44,6 +44,104 @@ export function getExploreBudget(fileCount: number): number { return 5; } +/** + * Adaptive output budget for `codegraph_explore`, scaled to project size. + * + * Smaller codebases get a tighter total cap, fewer default files, smaller + * per-file cap, and tighter clustering — so a focused query on a 100-file + * project doesn't dump a whole file's worth of source into the agent's + * context. Larger codebases keep the generous defaults because the + * agent's native discovery cost (grep + find + many Reads) genuinely + * dwarfs a fat explore call at that scale. 
+ * + * Meta-text ("additional relevant files" list, completeness signal, + * budget note) is gated off for tiny projects where one rich call is the + * whole story and the extra prose is just overhead. The Relationships + * section stays on in every tier — it is the cheapest structural signal. + * + * Tier breakpoints mirror `getExploreBudget` so a project sits in the + * same tier across both knobs. + */ +export interface ExploreOutputBudget { + /** Hard cap on total output characters. */ + maxOutputChars: number; + /** Default `maxFiles` when the caller didn't specify one. */ + defaultMaxFiles: number; + /** Cap on contiguous source returned per file (across all its clusters). */ + maxCharsPerFile: number; + /** Cluster gap threshold in lines — tighter clustering on small projects. */ + gapThreshold: number; + /** Max symbols listed in the per-file header (`#### path — sym(kind), ...`). */ + maxSymbolsInFileHeader: number; + /** Max edges shown per relationship kind in the Relationships section. */ + maxEdgesPerRelationshipKind: number; + /** Include the "Relationships" section. */ + includeRelationships: boolean; + /** Include the "Additional relevant files (not shown)" trailing list. */ + includeAdditionalFiles: boolean; + /** Include the "Complete source code is included above…" reminder. */ + includeCompletenessSignal: boolean; + /** Include the explore-budget reminder at the end.
*/ + includeBudgetNote: boolean; +} + +export function getExploreOutputBudget(fileCount: number): ExploreOutputBudget { + if (fileCount < 500) { + return { + maxOutputChars: 18000, + defaultMaxFiles: 5, + maxCharsPerFile: 3800, + gapThreshold: 8, + maxSymbolsInFileHeader: 6, + maxEdgesPerRelationshipKind: 6, + includeRelationships: true, + includeAdditionalFiles: false, + includeCompletenessSignal: false, + includeBudgetNote: false, + }; + } + if (fileCount < 5000) { + return { + maxOutputChars: 28000, + defaultMaxFiles: 9, + maxCharsPerFile: 5000, + gapThreshold: 12, + maxSymbolsInFileHeader: 10, + maxEdgesPerRelationshipKind: 10, + includeRelationships: true, + includeAdditionalFiles: true, + includeCompletenessSignal: true, + includeBudgetNote: true, + }; + } + if (fileCount < 15000) { + return { + maxOutputChars: 35000, + defaultMaxFiles: 12, + maxCharsPerFile: 7000, + gapThreshold: 15, + maxSymbolsInFileHeader: 15, + maxEdgesPerRelationshipKind: 15, + includeRelationships: true, + includeAdditionalFiles: true, + includeCompletenessSignal: true, + includeBudgetNote: true, + }; + } + return { + maxOutputChars: 38000, + defaultMaxFiles: 14, + maxCharsPerFile: 7000, + gapThreshold: 15, + maxSymbolsInFileHeader: 15, + maxEdgesPerRelationshipKind: 15, + includeRelationships: true, + includeAdditionalFiles: true, + includeCompletenessSignal: true, + includeBudgetNote: true, + }; +} + /** * Mark a Claude session as having consulted MCP tools. * This enables Grep/Glob/Bash commands that would otherwise be blocked. 
@@ -656,24 +754,35 @@ export class ToolHandler { return this.textResult(this.truncateOutput(formatted)); } - /** Maximum output for explore tool — sized to stay under MCP client token limits (~10k tokens) */ - private static readonly EXPLORE_MAX_OUTPUT = 35000; - /** * Handle codegraph_explore — deep exploration in a single call * * Strategy: find relevant symbols via graph traversal, group by file, * then read contiguous file sections covering all symbols per file. * This replaces multiple codegraph_node + Read calls. + * + * Output size is adaptive to project file count via + * `getExploreOutputBudget` — see #185 for why a fixed 35k cap was a + * tax on small projects while earning its keep on large ones. */ private async handleExplore(args: Record): Promise { const query = this.validateString(args.query, 'query'); if (typeof query !== 'string') return query; const cg = this.getCodeGraph(args.projectPath as string | undefined); - const maxFiles = clamp((args.maxFiles as number) || 12, 1, 20); const projectRoot = cg.getProjectRoot(); + // Resolve adaptive output budget from project size. Falls back to the + // largest-tier defaults if stats aren't available, which preserves + // pre-#185 behavior for callers that hit the rare stats failure. + let budget: ExploreOutputBudget; + try { + budget = getExploreOutputBudget(cg.getStats().fileCount); + } catch { + budget = getExploreOutputBudget(Infinity); + } + const maxFiles = clamp((args.maxFiles as number) || budget.defaultMaxFiles, 1, 20); + // Step 1: Find relevant context with generous parameters. 
// Use a large maxNodes budget — explore has its own 35k char output limit // that prevents context bloat, so more nodes just means better coverage @@ -765,7 +874,7 @@ export class ToolHandler { e.kind !== 'contains' // skip contains — it's implied by file grouping ); - if (significantEdges.length > 0) { + if (budget.includeRelationships && significantEdges.length > 0) { lines.push('### Relationships'); lines.push(''); @@ -782,14 +891,14 @@ export class ToolHandler { } for (const [kind, edges] of byKind) { - // Show up to 15 relationships per kind - const shown = edges.slice(0, 15); + const cap = budget.maxEdgesPerRelationshipKind; + const shown = edges.slice(0, cap); lines.push(`**${kind}:**`); for (const e of shown) { lines.push(`- ${e.source} → ${e.target}`); } - if (edges.length > 15) { - lines.push(`- ... and ${edges.length - 15} more`); + if (edges.length > cap) { + lines.push(`- ... and ${edges.length - cap} more`); } lines.push(''); } @@ -801,10 +910,11 @@ export class ToolHandler { let totalChars = lines.join('\n').length; let filesIncluded = 0; + let anyFileTrimmed = false; for (const [filePath, group] of sortedFiles) { if (filesIncluded >= maxFiles) break; - if (totalChars > ToolHandler.EXPLORE_MAX_OUTPUT * 0.9) break; + if (totalChars > budget.maxOutputChars * 0.9) break; const absPath = validatePathWithinRoot(projectRoot, filePath); if (!absPath || !existsSync(absPath)) continue; @@ -820,14 +930,26 @@ export class ToolHandler { const lang = group.nodes[0]?.language || ''; // Cluster nearby symbols to avoid reading huge gaps between distant symbols. - // Sort by start line, then merge overlapping/adjacent ranges (within 15 lines). - // Include both node ranges AND edge source locations so template sections - // with component usages/calls are covered (not just script block symbols). 
- const ranges: Array<{ start: number; end: number; name: string; kind: string }> = group.nodes + // Sort by start line, then merge overlapping/adjacent ranges (within the + // adaptive gap threshold). Include both node ranges AND edge source + // locations so template sections with component usages/calls are + // covered (not just script block symbols). + // + // Each range carries an `importance` score so we can rank clusters + // when the per-file budget forces us to drop some: entry-point nodes + // are worth 10, directly-connected nodes 3, peripheral nodes 1, and + // bare edge-source lines 2 (less than a connected node but more than + // a peripheral one — they hint at a reference but aren't a definition). + const ranges: Array<{ start: number; end: number; name: string; kind: string; importance: number }> = group.nodes .filter(n => n.startLine > 0 && n.endLine > 0) // Skip file/component nodes that span the entire file — they'd create one giant cluster .filter(n => !(n.kind === 'component' && n.startLine === 1 && n.endLine >= fileLines.length - 1)) - .map(n => ({ start: n.startLine, end: n.endLine, name: n.name, kind: n.kind })); + .map(n => { + let importance = 1; + if (entryNodeIds.has(n.id)) importance = 10; + else if (connectedToEntry.has(n.id)) importance = 3; + return { start: n.startLine, end: n.endLine, name: n.name, kind: n.kind, importance }; + }); // Add edge source locations in this file — captures template references // (component usages, event handlers) that aren't nodes themselves. @@ -844,7 +966,7 @@ export class ToolHandler { // Look up target name from subgraph first, fall back to edge kind const targetNode = subgraph.nodes.get(edge.target); const targetName = targetNode?.name ?? 
edge.kind; - ranges.push({ start: edge.line, end: edge.line, name: targetName, kind: edge.kind }); + ranges.push({ start: edge.line, end: edge.line, name: targetName, kind: edge.kind, importance: 2 }); } } @@ -852,46 +974,129 @@ export class ToolHandler { if (ranges.length === 0) continue; - const GAP_THRESHOLD = 15; // merge sections within 15 lines of each other - const clusters: Array<{ start: number; end: number; symbols: string[] }> = []; - let current = { start: ranges[0]!.start, end: ranges[0]!.end, symbols: [`${ranges[0]!.name}(${ranges[0]!.kind})`] }; + const gapThreshold = budget.gapThreshold; + const clusters: Array<{ start: number; end: number; symbols: string[]; score: number }> = []; + let current = { + start: ranges[0]!.start, + end: ranges[0]!.end, + symbols: [`${ranges[0]!.name}(${ranges[0]!.kind})`], + score: ranges[0]!.importance, + }; for (let i = 1; i < ranges.length; i++) { const r = ranges[i]!; - if (r.start <= current.end + GAP_THRESHOLD) { + if (r.start <= current.end + gapThreshold) { current.end = Math.max(current.end, r.end); current.symbols.push(`${r.name}(${r.kind})`); + current.score += r.importance; } else { clusters.push(current); - current = { start: r.start, end: r.end, symbols: [`${r.name}(${r.kind})`] }; + current = { + start: r.start, + end: r.end, + symbols: [`${r.name}(${r.kind})`], + score: r.importance, + }; } } clusters.push(current); - // Build file section output from clusters + // Build file section output from clusters, capped by per-file budget. + // The pathological case (#185): a file like Session.swift where every + // method is adjacent collapses into one cluster spanning the whole + // file, and dumping that into the agent's context is most of the + // token cost on small projects. We pick clusters in score order + // (importance per line, so we don't prefer one giant low-density + // cluster over several focused ones) until the per-file char cap is + // hit. 
Truly enormous single clusters get tail-trimmed with a marker. const contextPadding = 3; + const buildSection = (c: { start: number; end: number }): string => { + const startIdx = Math.max(0, c.start - 1 - contextPadding); + const endIdx = Math.min(fileLines.length, c.end + contextPadding); + return fileLines.slice(startIdx, endIdx).join('\n'); + }; + const GAP_MARKER = '\n\n// ... (gap) ...\n\n'; + + // Score clusters by score-per-line (density) so a 30-line cluster + // with two entry symbols outranks a 400-line cluster with two + // peripheral symbols. Stable tiebreak by score, then by smaller + // span (cheaper to include). + const rankedClusters = clusters + .map((c, i) => ({ idx: i, span: c.end - c.start + 1, c })) + .sort((a, b) => { + const densityA = a.c.score / a.span; + const densityB = b.c.score / b.span; + if (densityB !== densityA) return densityB - densityA; + if (b.c.score !== a.c.score) return b.c.score - a.c.score; + return a.span - b.span; + }); + + const chosenIndices = new Set(); + let projectedChars = 0; + for (const rc of rankedClusters) { + const sectionLen = buildSection(rc.c).length + (chosenIndices.size > 0 ? GAP_MARKER.length : 0); + // Always take the top-ranked cluster, even if oversize, so we don't + // return an empty file section (agent would then re-Read the file, + // negating the savings). + if (chosenIndices.size === 0) { + chosenIndices.add(rc.idx); + projectedChars += sectionLen; + continue; + } + if (projectedChars + sectionLen > budget.maxCharsPerFile) continue; + chosenIndices.add(rc.idx); + projectedChars += sectionLen; + } + + // Emit chosen clusters in source order so the file reads top-to-bottom. 
let fileSection = ''; const allSymbols: string[] = []; - - for (const cluster of clusters) { - const startIdx = Math.max(0, cluster.start - 1 - contextPadding); - const endIdx = Math.min(fileLines.length, cluster.end + contextPadding); - const section = fileLines.slice(startIdx, endIdx).join('\n'); - - if (fileSection.length > 0) { - fileSection += '\n\n// ... (gap) ...\n\n'; - } + let fileTrimmed = false; + for (let i = 0; i < clusters.length; i++) { + if (!chosenIndices.has(i)) continue; + const cluster = clusters[i]!; + const section = buildSection(cluster); + if (fileSection.length > 0) fileSection += GAP_MARKER; fileSection += section; allSymbols.push(...cluster.symbols); } - // Skip if this section would blow the output limit - if (totalChars + fileSection.length + 200 > ToolHandler.EXPLORE_MAX_OUTPUT) { - const budget = ToolHandler.EXPLORE_MAX_OUTPUT - totalChars - 200; - if (budget < 500) break; - const trimmed = fileSection.slice(0, budget) + '\n// ... trimmed ...'; + // If a single chosen cluster is still oversize (long monolithic + // function), tail-trim it. Better one trimmed view than nothing. + if (fileSection.length > budget.maxCharsPerFile) { + fileSection = fileSection.slice(0, budget.maxCharsPerFile) + '\n// ... trimmed ...'; + fileTrimmed = true; + } + if (chosenIndices.size < clusters.length || fileTrimmed) { + anyFileTrimmed = true; + } - lines.push(`#### ${filePath} — ${allSymbols.join(', ')}`); + // Dedupe + cap the symbols list shown in the per-file header. Some + // files (Session.swift in Alamofire) produced 3.4KB symbol lists + // from cluster scoring + edge-source lines, dwarfing the per-file + // body cap. Show top names by frequency, with a "+N more" tail. + const symbolCounts = new Map(); + for (const s of allSymbols) { + symbolCounts.set(s, (symbolCounts.get(s) ?? 
0) + 1); + } + const sortedSymbols = [...symbolCounts.entries()] + .sort((a, b) => b[1] - a[1]) + .map(([name]) => name); + const headerCap = budget.maxSymbolsInFileHeader; + const headerSymbols = sortedSymbols.slice(0, headerCap); + const omittedCount = sortedSymbols.length - headerSymbols.length; + const headerSuffix = omittedCount > 0 + ? `${headerSymbols.join(', ')}, +${omittedCount} more` + : headerSymbols.join(', '); + const fileHeader = `#### ${filePath} — ${headerSuffix}`; + + // Respect the total output cap on a file-by-file basis. + if (totalChars + fileSection.length + 200 > budget.maxOutputChars) { + const remaining = budget.maxOutputChars - totalChars - 200; + if (remaining < 500) break; + const trimmed = fileSection.slice(0, remaining) + '\n// ... trimmed ...'; + + lines.push(fileHeader); lines.push(''); lines.push('```' + lang); lines.push(trimmed); @@ -899,10 +1104,11 @@ export class ToolHandler { lines.push(''); totalChars += trimmed.length + 200; filesIncluded++; + anyFileTrimmed = true; break; } - lines.push(`#### ${filePath} — ${allSymbols.join(', ')}`); + lines.push(fileHeader); lines.push(''); lines.push('```' + lang); lines.push(fileSection); @@ -913,37 +1119,51 @@ export class ToolHandler { filesIncluded++; } - // Add remaining files as references (from both relevant and peripheral files) - const remainingRelevant = sortedFiles.slice(filesIncluded); - const peripheralFiles = [...fileGroups.entries()] - .filter(([, group]) => group.score < 3) - .sort((a, b) => b[1].score - a[1].score); - const remainingFiles = [...remainingRelevant, ...peripheralFiles]; - if (remainingFiles.length > 0) { - lines.push('### Additional relevant files (not shown)'); - lines.push(''); - for (const [filePath, group] of remainingFiles.slice(0, 10)) { - const symbols = group.nodes.map(n => `${n.name}:${n.startLine}`).join(', '); - lines.push(`- ${filePath}: ${symbols}`); - } - if (remainingFiles.length > 10) { - lines.push(`- ... 
and ${remainingFiles.length - 10} more files`); + // Add remaining files as references (from both relevant and peripheral files). + // Small projects (per budget) skip this — the relevant story already fits + // in the source section, and a trailing pointer list is pure overhead. + if (budget.includeAdditionalFiles) { + const remainingRelevant = sortedFiles.slice(filesIncluded); + const peripheralFiles = [...fileGroups.entries()] + .filter(([, group]) => group.score < 3) + .sort((a, b) => b[1].score - a[1].score); + const remainingFiles = [...remainingRelevant, ...peripheralFiles]; + if (remainingFiles.length > 0) { + lines.push('### Additional relevant files (not shown)'); + lines.push(''); + for (const [filePath, group] of remainingFiles.slice(0, 10)) { + const symbols = group.nodes.map(n => `${n.name}:${n.startLine}`).join(', '); + lines.push(`- ${filePath}: ${symbols}`); + } + if (remainingFiles.length > 10) { + lines.push(`- ... and ${remainingFiles.length - 10} more files`); + } } } - // Add completeness signal so agents know they don't need to re-read these files - lines.push(''); - lines.push('---'); - lines.push(`> **Complete source code is included above for ${filesIncluded} files.** You do NOT need to re-read these files — the relevant sections are already shown in full. Only use Read/Grep for files listed under "Additional relevant files" if you need more detail.`); + // Add completeness signal so agents know they don't need to re-read these files. + // On small projects the budget gates this off — but if we actually had to + // trim or drop clusters, surface a brief note so the agent knows it can + // still Read for more detail. + if (budget.includeCompletenessSignal) { + lines.push(''); + lines.push('---'); + lines.push(`> **Complete source code is included above for ${filesIncluded} files.** You do NOT need to re-read these files — the relevant sections are already shown in full. 
Only use Read/Grep for files listed under "Additional relevant files" if you need more detail.`); + } else if (anyFileTrimmed) { + lines.push(''); + lines.push(`> Some file sections were trimmed for size. Use \`codegraph_node\` or Read for the full source if needed.`); + } // Add explore budget note based on project size - try { - const stats = cg.getStats(); - const budget = getExploreBudget(stats.fileCount); - lines.push(''); - lines.push(`> **Explore budget: ${budget} calls max for this project (${stats.fileCount.toLocaleString()} files indexed).** Stop exploring and synthesize your answer once you've used ${budget} calls — do NOT make additional explore calls beyond this budget.`); - } catch { - // Stats unavailable — skip budget note + if (budget.includeBudgetNote) { + try { + const stats = cg.getStats(); + const callBudget = getExploreBudget(stats.fileCount); + lines.push(''); + lines.push(`> **Explore budget: ${callBudget} calls max for this project (${stats.fileCount.toLocaleString()} files indexed).** Stop exploring and synthesize your answer once you've used ${callBudget} calls — do NOT make additional explore calls beyond this budget.`); + } catch { + // Stats unavailable — skip budget note + } } return this.textResult(lines.join('\n')); From 2c1a314b84fd3633624f10f752163f9629c105e2 Mon Sep 17 00:00:00 2001 From: Colby Mchenry Date: Tue, 19 May 2026 17:16:12 -0500 Subject: [PATCH 08/58] feat(mcp): line numbers in explore output + per-file cluster fixes (#188) * feat(mcp): line numbers in explore output + per-file cluster fixes Follow-up to #185. Three changes to codegraph_explore: 1. Source sections now carry cat -n style line-number prefixes (\t), so the agent can cite file:line straight from the payload instead of re-Reading the file just to recover a line number. Isolated A/B: the no-line-numbers arm spent 2 Reads + a grep to find a line number the line-numbered arm cited with zero follow-up calls. Payload cost ~3-5%. 
Toggle off with CODEGRAPH_EXPLORE_LINENUMS=0. 2. Per-file cluster selection now ranks clusters containing a query entry point ahead of dense declaration blocks. Density-only ranking buried the relevant methods (perform/didCreateURLRequest/task in Alamofire's Session.swift) under the top-of-file class header + property list. 3. Whole-file "envelope" nodes (a class/struct/etc. spanning >50% of the file) are excluded from clustering. The Session class spans ~1,400 lines; keeping it collapsed every method into one giant cluster that tail-trimmed down to just the class header, hiding the methods. Net vs the 0.7.10 baseline, line numbers on: Alamofire -60%, Excalidraw -32%, VS Code -12% per explore call. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(mcp): language-neutral omission markers in explore output The gap separator and the two tail-trim markers used C-style `//` comments, which aren't comments in Python, Ruby, etc. Switch to plain `... (gap) ...` / `... (trimmed) ...` so they read correctly inside any language's fenced source block. With line numbers on, the line-number jump already corroborates a gap. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(mcp): language-neutral truncation marker in codegraph_context Sibling to the explore marker fix: codegraph_context's code-block truncation used a C-style `// ... truncated ...`. Switch to `... (truncated) ...` so it reads correctly in any language's fenced source block. 
Co-Authored-By: Claude Opus 4.7 (1M context) * chore(release): bump version to 0.7.11 --------- Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 35 ++++++++-- __tests__/explore-output-budget.test.ts | 43 ++++++++++++ package-lock.json | 4 +- package.json | 2 +- src/context/index.ts | 6 +- src/mcp/tools.ts | 87 ++++++++++++++++++++----- 6 files changed, 150 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 828421d5..7c32c152 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,18 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- **MCP / explore**: `codegraph_explore` source sections now carry line + numbers (cat -n style `\t`, matching the Read tool). This lets + the agent cite `file:line` straight from the explore payload instead of + re-opening the file just to find a line number — the dominant residual + cost on precise-tracing questions. In an isolated A/B (answer a + "which exact line" question with the relevant code already in the + payload), the no-line-numbers arm spent 2 file Reads + a grep recovering + the line number while the line-numbered arm answered with zero follow-up + tool calls. Payload cost is small (~3-5%). Set + `CODEGRAPH_EXPLORE_LINENUMS=0` to disable. + ### Changed - **MCP / explore**: `codegraph_explore` output is now adaptive to project size. The tool used to apply a fixed 35KB cap regardless of how large the @@ -22,12 +34,23 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). (<5,000) caps at ~28KB; large (<15,000) keeps the historical ~35KB; very large goes up to ~38KB. A new per-file char cap also prevents a single file with many adjacent symbols from collapsing into one whole-file dump - (the Alamofire `Session.swift` case from #185). Measured against the - same repos used in the README benchmark: Alamofire ~62% smaller per call, - Excalidraw ~35%, VS Code ~14%. 
Agent-trust floor still holds — the - Relationships section, scored cluster selection, and structured-source - output are all retained. Thanks to - [@essopsp](https://github.com/essopsp) for the repro. + (the Alamofire `Session.swift` case from #185). Per-file cluster + selection ranks clusters that contain a query entry point ahead of dense + declaration blocks, and whole-file "envelope" nodes (a class/struct that + spans most of the file) are excluded from clustering so the methods the + query asked about aren't buried under the container's opening lines. + Measured against the same repos used in the README benchmark, end state + with line numbers on: Alamofire ~60% smaller per call, Excalidraw ~32%, + VS Code ~12%. Agent-trust floor still holds — the Relationships section, + scored cluster selection, and structured-source output are all retained. + Thanks to [@essopsp](https://github.com/essopsp) for the repro. + +### Fixed +- **MCP**: source-omission markers in `codegraph_explore` and + `codegraph_context` output are now language-neutral (`... (gap) ...`, + `... (trimmed) ...`, `... (truncated) ...`) instead of C-style `//` + comments, which were misleading inside Python, Ruby, and other non-C + fenced source blocks. ## [0.7.10] - 2026-05-19 diff --git a/__tests__/explore-output-budget.test.ts b/__tests__/explore-output-budget.test.ts index 36717f82..65ddc648 100644 --- a/__tests__/explore-output-budget.test.ts +++ b/__tests__/explore-output-budget.test.ts @@ -188,4 +188,47 @@ describe('codegraph_explore output respects the adaptive budget', () => { const sourceFollowsHeader = text.indexOf('### Source Code') > 0; expect(hasRelationships || sourceFollowsHeader).toBe(true); }); + + it('prefixes source lines with line numbers by default (cat -n style)', async () => { + delete process.env.CODEGRAPH_EXPLORE_LINENUMS; + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? 
''; + // At least one fenced source line should look like `\t`. + expect(/\n\d+\t/.test(text)).toBe(true); + }); + + it('omits line numbers when CODEGRAPH_EXPLORE_LINENUMS=0', async () => { + process.env.CODEGRAPH_EXPLORE_LINENUMS = '0'; + try { + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? ''; + // The synthetic source has no tab-prefixed numeric lines of its own, + // so none should appear when the toggle is off. + expect(/\n\d+\t(?:export| )/.test(text)).toBe(false); + } finally { + delete process.env.CODEGRAPH_EXPLORE_LINENUMS; + } + }); + + it('uses language-neutral omission markers (no C-style // in the output)', async () => { + // The gap/trimmed separators must not assume `//` is a comment — that's + // wrong in Python, Ruby, etc. They render inside fenced source blocks. + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? ''; + expect(text).not.toContain('// ... (gap)'); + expect(text).not.toContain('// ... trimmed'); + }); + + it('does not collapse a whole-file class into just its header (envelope filter)', async () => { + // The synthetic `Session` class spans the entire file. Without the + // envelope filter it would form one giant cluster that tail-trims to + // the class declaration, hiding the methods. Confirm real method bodies + // make it into the output. Regression guard for the #185 follow-up. + const result = await handler.execute('codegraph_explore', { query: 'Session method helper' }); + const text = result.content?.[0]?.text ?? ''; + // A method body line (`methodN(arg: string)`) should appear, not just + // the `export class Session {` opener. 
+ const hasMethodBody = /method\d+\(arg: string\)/.test(text); + expect(hasMethodBody).toBe(true); + }); }); diff --git a/package-lock.json b/package-lock.json index dfcebafa..2d4e515a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@colbymchenry/codegraph", - "version": "0.7.10", + "version": "0.7.11", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@colbymchenry/codegraph", - "version": "0.7.10", + "version": "0.7.11", "license": "MIT", "dependencies": { "@clack/prompts": "^1.3.0", diff --git a/package.json b/package.json index 2731804b..60dc5c71 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@colbymchenry/codegraph", - "version": "0.7.10", + "version": "0.7.11", "description": "Supercharge Claude Code with semantic code intelligence. 94% fewer tool calls • 77% faster exploration • 100% local.", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/src/context/index.ts b/src/context/index.ts index 94192377..7298cd41 100644 --- a/src/context/index.ts +++ b/src/context/index.ts @@ -1006,9 +1006,11 @@ export class ContextBuilder { const code = await this.extractNodeCode(node); if (code) { - // Truncate if too long + // Truncate if too long. Language-neutral marker (no `//` — not a + // comment in Python, Ruby, etc.); this renders inside a fenced + // source block whose language varies. const truncated = code.length > maxBlockSize - ? code.slice(0, maxBlockSize) + '\n// ... truncated ...' + ? code.slice(0, maxBlockSize) + '\n... (truncated) ...' : code; blocks.push({ diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 21767906..7b0d55b0 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -142,6 +142,38 @@ export function getExploreOutputBudget(fileCount: number): ExploreOutputBudget { }; } +/** + * Whether `codegraph_explore` should prefix source lines with their line + * numbers (cat -n style: `\t`). 
+ * + * Line numbers let the agent cite `file:line` straight from the explore + * payload instead of re-Reading the file just to find a line number — the + * dominant residual cost on precise-tracing questions (#185 follow-up). + * + * Defaults ON. Set `CODEGRAPH_EXPLORE_LINENUMS=0` to disable (used by the + * A/B harness to measure the payload-cost vs. read-savings tradeoff). + */ +function exploreLineNumbersEnabled(): boolean { + return process.env.CODEGRAPH_EXPLORE_LINENUMS !== '0'; +} + +/** + * Prefix each line of a source slice with its 1-based line number, matching + * the Read tool's `cat -n` convention (number + tab) so the agent treats it + * the same way it treats Read output. + * + * @param slice contiguous source text (already extracted from the file) + * @param firstLineNumber the 1-based line number of the slice's first line + */ +function numberSourceLines(slice: string, firstLineNumber: number): string { + const out: string[] = []; + const split = slice.split('\n'); + for (let i = 0; i < split.length; i++) { + out.push(`${firstLineNumber + i}\t${split[i]}`); + } + return out.join('\n'); +} + /** * Mark a Claude session as having consulted MCP tools. * This enables Grep/Glob/Bash commands that would otherwise be blocked. @@ -940,10 +972,19 @@ export class ToolHandler { // are worth 10, directly-connected nodes 3, peripheral nodes 1, and // bare edge-source lines 2 (less than a connected node but more than // a peripheral one — they hint at a reference but aren't a definition). + // Container kinds whose body can span most/all of a file. 
When such a + // node covers most of the file we drop it from the ranges: keeping it + // would merge every method inside it into one giant cluster spanning + // the whole file, which then tail-trims down to just the container's + // opening lines (its header/declarations) and buries the methods the + // query actually asked about (#185 follow-up — Session.swift in + // Alamofire is the canonical case: the `Session` class spans ~1,400 + // lines). We want the granular symbols inside, not the envelope. + const ENVELOPE_KINDS = new Set(['file', 'module', 'class', 'struct', 'interface', 'enum', 'namespace', 'protocol', 'trait', 'component']); const ranges: Array<{ start: number; end: number; name: string; kind: string; importance: number }> = group.nodes .filter(n => n.startLine > 0 && n.endLine > 0) - // Skip file/component nodes that span the entire file — they'd create one giant cluster - .filter(n => !(n.kind === 'component' && n.startLine === 1 && n.endLine >= fileLines.length - 1)) + // Drop whole-file envelope nodes (containers covering >50% of the file). 
+ .filter(n => !(ENVELOPE_KINDS.has(n.kind) && (n.endLine - n.startLine + 1) > fileLines.length * 0.5)) .map(n => { let importance = 1; if (entryNodeIds.has(n.id)) importance = 10; @@ -975,12 +1016,13 @@ export class ToolHandler { if (ranges.length === 0) continue; const gapThreshold = budget.gapThreshold; - const clusters: Array<{ start: number; end: number; symbols: string[]; score: number }> = []; + const clusters: Array<{ start: number; end: number; symbols: string[]; score: number; maxImportance: number }> = []; let current = { start: ranges[0]!.start, end: ranges[0]!.end, symbols: [`${ranges[0]!.name}(${ranges[0]!.kind})`], score: ranges[0]!.importance, + maxImportance: ranges[0]!.importance, }; for (let i = 1; i < ranges.length; i++) { @@ -989,6 +1031,7 @@ export class ToolHandler { current.end = Math.max(current.end, r.end); current.symbols.push(`${r.name}(${r.kind})`); current.score += r.importance; + current.maxImportance = Math.max(current.maxImportance, r.importance); } else { clusters.push(current); current = { @@ -996,6 +1039,7 @@ export class ToolHandler { end: r.end, symbols: [`${r.name}(${r.kind})`], score: r.importance, + maxImportance: r.importance, }; } } @@ -1005,25 +1049,36 @@ export class ToolHandler { // The pathological case (#185): a file like Session.swift where every // method is adjacent collapses into one cluster spanning the whole // file, and dumping that into the agent's context is most of the - // token cost on small projects. We pick clusters in score order - // (importance per line, so we don't prefer one giant low-density - // cluster over several focused ones) until the per-file char cap is - // hit. Truly enormous single clusters get tail-trimmed with a marker. + // token cost on small projects. We pick clusters in priority order + // until the per-file char cap is hit. Truly enormous single clusters + // get tail-trimmed with a marker. 
const contextPadding = 3; + const withLineNumbers = exploreLineNumbersEnabled(); const buildSection = (c: { start: number; end: number }): string => { const startIdx = Math.max(0, c.start - 1 - contextPadding); const endIdx = Math.min(fileLines.length, c.end + contextPadding); - return fileLines.slice(startIdx, endIdx).join('\n'); + const slice = fileLines.slice(startIdx, endIdx).join('\n'); + // startIdx is 0-based, so the slice's first line is line startIdx + 1. + return withLineNumbers ? numberSourceLines(slice, startIdx + 1) : slice; }; - const GAP_MARKER = '\n\n// ... (gap) ...\n\n'; - - // Score clusters by score-per-line (density) so a 30-line cluster - // with two entry symbols outranks a 400-line cluster with two - // peripheral symbols. Stable tiebreak by score, then by smaller - // span (cheaper to include). + // Language-neutral separator (no `//` — not a comment in Python, Ruby, + // etc.). With line numbers on, the line-number jump also signals the gap. + const GAP_MARKER = '\n\n... (gap) ...\n\n'; + + // Rank clusters for inclusion under the per-file cap. Entry-point + // clusters come first: a cluster containing a query entry point + // (importance 10) must outrank a dense block of mere declarations, + // otherwise on a large file like Session.swift the top-of-file class + // header + property list (many adjacent low-importance nodes, high + // density) wins the budget and buries the actual methods the query + // asked about (perform/didCreateURLRequest/task live deep in the + // file). Within the same importance tier, prefer density (score per + // line) so we still favor focused clusters over sprawling ones, then + // smaller span as a cheap-to-include tiebreak. 
const rankedClusters = clusters .map((c, i) => ({ idx: i, span: c.end - c.start + 1, c })) .sort((a, b) => { + if (b.c.maxImportance !== a.c.maxImportance) return b.c.maxImportance - a.c.maxImportance; const densityA = a.c.score / a.span; const densityB = b.c.score / b.span; if (densityB !== densityA) return densityB - densityA; @@ -1064,7 +1119,7 @@ export class ToolHandler { // If a single chosen cluster is still oversize (long monolithic // function), tail-trim it. Better one trimmed view than nothing. if (fileSection.length > budget.maxCharsPerFile) { - fileSection = fileSection.slice(0, budget.maxCharsPerFile) + '\n// ... trimmed ...'; + fileSection = fileSection.slice(0, budget.maxCharsPerFile) + '\n... (trimmed) ...'; fileTrimmed = true; } if (chosenIndices.size < clusters.length || fileTrimmed) { @@ -1094,7 +1149,7 @@ export class ToolHandler { if (totalChars + fileSection.length + 200 > budget.maxOutputChars) { const remaining = budget.maxOutputChars - totalChars - 200; if (remaining < 500) break; - const trimmed = fileSection.slice(0, remaining) + '\n// ... trimmed ...'; + const trimmed = fileSection.slice(0, remaining) + '\n... (trimmed) ...'; lines.push(fileHeader); lines.push(''); From 1cbca5a51e94341046e8ce89dbae5d20f237f84a Mon Sep 17 00:00:00 2001 From: Colby McHenry Date: Wed, 20 May 2026 08:25:43 -0500 Subject: [PATCH 09/58] docs: add Star History chart to README --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 910d7801..49cf8d54 100644 --- a/README.md +++ b/README.md @@ -492,6 +492,16 @@ The `.codegraph/config.json` file controls indexing: **Missing symbols** — The MCP server auto-syncs on save (wait a couple seconds). Run `codegraph sync` manually if needed. Check that the file's language is supported and isn't excluded by config patterns. 
+## Star History + + + + + + Star History Chart + + + ## License MIT From 7fe64b32be0a08b35d737e76dcbb79c79ddea408 Mon Sep 17 00:00:00 2001 From: Colby McHenry Date: Wed, 20 May 2026 09:39:17 -0500 Subject: [PATCH 10/58] feat(eval): add agent-eval harness and /audit + /publish Claude skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the old interactive publish.js script with two Claude skills and a full agent-evaluation harness: - `.claude/skills/audit/` — `/audit` skill drives `scripts/agent-eval/audit.sh` to benchmark retrieval quality (with vs. without codegraph) on a chosen real-world repo from the new `corpus.json` (17 repos across 14 languages). - `.claude/skills/publish/` — `/publish` skill orchestrates the full release workflow (preflight → changelog → confirmation gate → bump/build → npm publish → GitHub release), replacing `publish.js`. - `scripts/agent-eval/` — headless (`run-agent.sh`, `run-all.sh`) and interactive tmux (`itrun.sh`) harnesses with stream-json parsers (`parse-run.mjs`, `parse-session.mjs`) that report tool calls, token usage, and a VERDICT line summarising codegraph_explore vs. Read/Grep counts. - `run-interactive-test.md` — documents the two harnesses, idle-detection approach, and what "good" agent behavior looks like after explore-first guidance. 
--- .claude/skills/audit/SKILL.md | 74 +++++++++++++++ .claude/skills/audit/corpus.json | 63 +++++++++++++ .claude/skills/publish/SKILL.md | 136 +++++++++++++++++++++++++++ publish.js | 65 ------------- run-interactive-test.md | 131 ++++++++++++++++++++++++++ scripts/agent-eval/audit.sh | 68 ++++++++++++++ scripts/agent-eval/itrun.sh | 107 +++++++++++++++++++++ scripts/agent-eval/parse-run.mjs | 45 +++++++++ scripts/agent-eval/parse-session.mjs | 93 ++++++++++++++++++ scripts/agent-eval/run-agent.sh | 34 +++++++ scripts/agent-eval/run-all.sh | 67 +++++++++++++ 11 files changed, 818 insertions(+), 65 deletions(-) create mode 100644 .claude/skills/audit/SKILL.md create mode 100644 .claude/skills/audit/corpus.json create mode 100644 .claude/skills/publish/SKILL.md delete mode 100644 publish.js create mode 100644 run-interactive-test.md create mode 100755 scripts/agent-eval/audit.sh create mode 100755 scripts/agent-eval/itrun.sh create mode 100644 scripts/agent-eval/parse-run.mjs create mode 100644 scripts/agent-eval/parse-session.mjs create mode 100755 scripts/agent-eval/run-agent.sh create mode 100755 scripts/agent-eval/run-all.sh diff --git a/.claude/skills/audit/SKILL.md b/.claude/skills/audit/SKILL.md new file mode 100644 index 00000000..ee13ebe1 --- /dev/null +++ b/.claude/skills/audit/SKILL.md @@ -0,0 +1,74 @@ +--- +name: audit +description: Benchmark CodeGraph retrieval quality on a real codebase by comparing agent behavior with vs without CodeGraph. Use when the user runs /audit or asks to test, benchmark, audit, or validate a codegraph version (the local dev build or a published npm version) against a language's repo. +--- + +# CodeGraph Quality Audit + +Measures how much CodeGraph helps an agent versus plain grep/read, for a chosen +codegraph version on a chosen real-world repo. Drives the harness in +`scripts/agent-eval/`. + +## Prerequisites +- `tmux` 3+, a logged-in `claude` CLI, `node`, `git` (macOS/Linux). +- Run from the codegraph repo root. 
+ +## Workflow + +Copy this checklist: +``` +- [ ] 1. Pick version (local or npm) +- [ ] 2. Pick language +- [ ] 3. Pick repo by size +- [ ] 4. Pick harness (headless / tmux / both) +- [ ] 5. Run audit.sh in the background +- [ ] 6. Report results +``` + +**Step 1 — version.** Ask with `AskUserQuestion`: which codegraph version to test. +Offer "Local dev build" and "Latest published"; the free-text "Other" lets the +user type a specific version (e.g. `0.7.10`). Map the answer to a VERSION token: +- "Local dev build" → `local` +- "Latest published" → `latest` +- a typed version → that string (e.g. `0.7.10`) + +**Step 2 — language.** Read `.claude/skills/audit/corpus.json`. Ask with +`AskUserQuestion` which language to test, listing the languages that have entries. + +**Step 3 — repo.** From the chosen language's entries, ask which repo. Label each +option with its size and file count, e.g. `excalidraw — Medium (~600 files)`. +Each entry carries the `repo` URL and a representative `question`. + +**Step 4 — harness.** Ask with `AskUserQuestion` which harness to run, and map +the answer to a MODE token: +- "Headless" → `headless` — `claude -p` with stream-json: exact tokens/cost and a + clean tool sequence (2 runs, fast, no TTY). +- "Interactive (tmux)" → `tmux` — drives the real Claude TUI in tmux: faithful + Explore-subagent behavior, metrics from session logs (2 runs, slower). +- "Both" → `all` — headless + interactive (4 runs). + +**Step 5 — run.** Launch in the background (sets the version, clones if missing, +wipes + re-indexes, runs the chosen arms — several minutes): +```bash +scripts/agent-eval/audit.sh "" +``` + +**Step 6 — report.** When the job finishes, read the log and report per arm: +- Headless (`parse-run.mjs`): total tool calls, file `Read`s, Grep/Bash, + codegraph-tool calls, duration, **total cost**. +- Interactive (`parse-session.mjs`): the `VERDICT: codegraph_explore used Nx | + Read N | Grep/Bash N` and `TOKENS:` lines. 
+ +Lead with cost + tool/Read counts — they are the reliable signals; raw token +in/out are confounded by subagent delegation and prompt caching. State whether +codegraph reduced effort and whether both arms reached a correct answer. + +## Notes +- The index is rebuilt every run (`audit.sh` wipes `.codegraph`) — different + versions extract differently, so an index must be served by the same binary + that built it. +- `audit.sh` temporarily mutates the global `codegraph` install for the test, + then restores your dev link via `local-install.sh`. +- Corpus repos are cloned to `/tmp/codegraph-corpus` (reused if already present). +- Add or edit repos in `corpus.json` (fields: `name`, `repo`, `size`, `files`, + `question`). diff --git a/.claude/skills/audit/corpus.json b/.claude/skills/audit/corpus.json new file mode 100644 index 00000000..4b48dab0 --- /dev/null +++ b/.claude/skills/audit/corpus.json @@ -0,0 +1,63 @@ +{ + "_comment": "Test corpus for /audit. Add entries freely. size: Small (<~150 files), Medium (~150-1500), Large (>~1500). 'question' is a representative architectural question that exercises cross-file understanding.", + "TypeScript": [ + { "name": "ky", "repo": "https://github.com/sindresorhus/ky", "size": "Small", "files": "~25", "question": "How does ky implement request retries and timeouts?" }, + { "name": "excalidraw", "repo": "https://github.com/excalidraw/excalidraw", "size": "Medium", "files": "~600", "question": "How does Excalidraw render and update canvas elements?" }, + { "name": "vscode", "repo": "https://github.com/microsoft/vscode", "size": "Large", "files": "~10000", "question": "How does the extension host communicate with the main process?" } + ], + "JavaScript": [ + { "name": "express", "repo": "https://github.com/expressjs/express", "size": "Small", "files": "~50", "question": "How does Express route a request through its middleware stack?" 
} + ], + "Go": [ + { "name": "cobra", "repo": "https://github.com/spf13/cobra", "size": "Small", "files": "~50", "question": "How does cobra parse commands and flags?" }, + { "name": "gin", "repo": "https://github.com/gin-gonic/gin", "size": "Medium", "files": "~150", "question": "How does gin route requests through its middleware chain?" }, + { "name": "terraform", "repo": "https://github.com/hashicorp/terraform", "size": "Large", "files": "~4000", "question": "How does Terraform build and walk the resource dependency graph?" } + ], + "Python": [ + { "name": "click", "repo": "https://github.com/pallets/click", "size": "Small", "files": "~60", "question": "How does click parse command-line arguments into commands?" }, + { "name": "flask", "repo": "https://github.com/pallets/flask", "size": "Medium", "files": "~90", "question": "How does Flask dispatch a request to a view function?" }, + { "name": "django", "repo": "https://github.com/django/django", "size": "Large", "files": "~2700", "question": "How does Django's ORM build and execute a query from a QuerySet?" } + ], + "Rust": [ + { "name": "clap", "repo": "https://github.com/clap-rs/clap", "size": "Medium", "files": "~200", "question": "How does clap parse arguments against a derived command definition?" }, + { "name": "tokio", "repo": "https://github.com/tokio-rs/tokio", "size": "Large", "files": "~700", "question": "How does tokio schedule and run async tasks on its runtime?" }, + { "name": "deno", "repo": "https://github.com/denoland/deno", "size": "Large", "files": "~1500", "question": "How does Deno load and execute a TypeScript module?" } + ], + "Java": [ + { "name": "gson", "repo": "https://github.com/google/gson", "size": "Medium", "files": "~200", "question": "How does Gson serialize an object to JSON?" }, + { "name": "okhttp", "repo": "https://github.com/square/okhttp", "size": "Medium", "files": "~640", "question": "How does OkHttp process a request through its interceptor chain?" 
}, + { "name": "guava", "repo": "https://github.com/google/guava", "size": "Large", "files": "~3000", "question": "How does Guava's CacheBuilder build and configure a cache?" } + ], + "Kotlin": [ + { "name": "koin", "repo": "https://github.com/InsertKoinIO/koin", "size": "Medium", "files": "~300", "question": "How does Koin resolve and inject dependencies?" }, + { "name": "leakcanary", "repo": "https://github.com/square/leakcanary", "size": "Medium", "files": "~250", "question": "How does LeakCanary detect and analyze a memory leak?" } + ], + "Swift": [ + { "name": "alamofire", "repo": "https://github.com/Alamofire/Alamofire", "size": "Small", "files": "~100", "question": "How does Alamofire build, send, and validate a request?" } + ], + "C#": [ + { "name": "serilog", "repo": "https://github.com/serilog/serilog", "size": "Medium", "files": "~250", "question": "How does Serilog route a log event to its sinks?" }, + { "name": "jellyfin", "repo": "https://github.com/jellyfin/jellyfin", "size": "Large", "files": "~2500", "question": "How does Jellyfin scan and identify items in a media library?" } + ], + "Ruby": [ + { "name": "sinatra", "repo": "https://github.com/sinatra/sinatra", "size": "Small", "files": "~60", "question": "How does Sinatra match a request to a route handler?" }, + { "name": "discourse", "repo": "https://github.com/discourse/discourse", "size": "Large", "files": "~3000", "question": "How does Discourse create and render a new post?" } + ], + "PHP": [ + { "name": "slim", "repo": "https://github.com/slimphp/Slim", "size": "Small", "files": "~80", "question": "How does Slim handle a request through its middleware?" }, + { "name": "laravel", "repo": "https://github.com/laravel/framework", "size": "Large", "files": "~3000", "question": "How does Laravel resolve and dispatch a route to a controller?" 
} + ], + "C": [ + { "name": "redis", "repo": "https://github.com/redis/redis", "size": "Large", "files": "~600", "question": "How does Redis parse and dispatch a client command?" } + ], + "C++": [ + { "name": "json", "repo": "https://github.com/nlohmann/json", "size": "Small", "files": "~100", "question": "How does nlohmann::json parse a JSON string into a value?" }, + { "name": "grpc", "repo": "https://github.com/grpc/grpc", "size": "Large", "files": "~3000", "question": "How does gRPC dispatch an incoming RPC to its handler?" } + ], + "Dart": [ + { "name": "flutter", "repo": "https://github.com/flutter/flutter", "size": "Large", "files": "~6000", "question": "How does Flutter build and lay out a widget tree?" } + ], + "Svelte": [ + { "name": "shadcn-svelte", "repo": "https://github.com/huntabyte/shadcn-svelte", "size": "Medium", "files": "~600", "question": "How do shadcn-svelte components compose and apply their styling?" } + ] +} diff --git a/.claude/skills/publish/SKILL.md b/.claude/skills/publish/SKILL.md new file mode 100644 index 00000000..84c6d4b3 --- /dev/null +++ b/.claude/skills/publish/SKILL.md @@ -0,0 +1,136 @@ +--- +name: publish +description: Publishes a new minor or major release of this npm package (codegraph). Reads the latest version from npm, generates a user-perspective CHANGELOG entry from commits since the last tag, bumps package.json, publishes to npm, and creates the matching GitHub release. Use when the user runs /publish or asks to cut, ship, or publish a release / new version. +--- + +# Publish a release + +Cut a **minor or major** release: generate the changelog, bump, publish to npm, and create the GitHub release. Patch releases are intentionally not offered here. + +This skill performs the actual publish (npm publish, git push, GitHub release) — that is the whole point of invoking it, so the general "hand the user the commands" rule does **not** apply inside `/publish`. 
The **confirmation gate in Step 5 is the safeguard**: never run a step past it without explicit approval. + +Run from the repo root. + +## Workflow + +Copy this checklist and work through it in order: + +``` +- [ ] 1. Preflight: branch, sync, auth +- [ ] 2. Read base version from npm, compute candidates +- [ ] 3. Ask the user: minor or major +- [ ] 4. Generate the CHANGELOG entry from commits since the last tag +- [ ] 5. CONFIRMATION GATE — show changelog + plan, get explicit approval +- [ ] 6. Write CHANGELOG.md, bump, build +- [ ] 7. Commit + push +- [ ] 8. npm publish +- [ ] 9. scripts/release.sh (GitHub release) +- [ ] 10. Verify on the npm registry +``` + +### Step 1 — Preflight + +```bash +git rev-parse --abbrev-ref HEAD # expect: main +git fetch origin +git status --porcelain # working tree should be clean +git rev-list --left-right --count origin/main...HEAD # "<behind> <ahead>" +npm whoami # npm auth (publish will fail without it) +gh auth status # gh auth (release.sh needs it) +``` + +- If not on `main`, stop and ask the user to confirm releasing from this branch. +- If behind origin, `git pull --ff-only` so the final push is a fast-forward. +- If the tree has **unrelated** uncommitted changes, stop and ask — the release commit only stages 3 files, but a dirty tree usually means something's mid-flight. +- If `npm whoami` or `gh auth status` fails, stop and tell the user to authenticate. + +### Step 2 — Base version + candidates + +The latest **published** version is the source of truth, not local `package.json`. + +```bash +PKG=$(node -p "require('./package.json').name") +BASE=$(npm view "$PKG" version) +node -e "const [a,b]=process.argv[1].split('.').map(Number);console.log('minor ->',a+'.'+(b+1)+'.0');console.log('major ->',(a+1)+'.0.0')" "$BASE" +``` + +Note if local `package.json` differs from `BASE` (an unpublished bump) — surface it, but still base the new version on npm.
+ +### Step 3 — Ask minor or major + +Use the **AskUserQuestion** tool with the two computed candidates as options (show the resulting version in each label, e.g. "minor → 0.8.0"). Set the new version from the answer. + +### Step 4 — Generate the changelog entry + +```bash +LAST=$(git describe --tags --abbrev=0 --match 'v*' 2>/dev/null) +git log --no-merges "${LAST}..HEAD" --pretty=format:'%h %s' +``` + +Read the commit subjects; for any whose user impact is unclear, inspect the diff (`git show <sha>` or `git diff "${LAST}..HEAD" -- <path>`). Then **write the entry yourself** following the repo's conventions in `CLAUDE.md` → "Writing changelog entries": + +- Header: `## [X.Y.Z] - YYYY-MM-DD` (get the date with `date +%F`). +- Group under `### Added`, `### Changed`, `### Fixed`, `### Removed`, `### Deprecated`, `### Security` — **omit empty sections**. +- Write from the **user's perspective** (observable capability/symptom), not the implementation. Collapse noisy commits ("fix typo", "address review") into the feature they belong to or drop them. +- Plan the bottom link reference: `[X.Y.Z]: https://github.com/colbymchenry/codegraph/releases/tag/vX.Y.Z`. + +Do not write to any file yet — draft it for review first. + +### Step 5 — CONFIRMATION GATE + +Show the user, in chat: +1. The new version (`BASE` → `X.Y.Z`, minor/major). +2. The full drafted changelog entry. +3. The exact actions Steps 6–9 will take (commit + push + npm publish + GitHub release). + +Then **STOP**. Proceed only on explicit approval ("yes" / "proceed"). If the user requests prose changes, revise the draft and re-show. Do not run any command below until approved. + +### Step 6 — Write changelog, bump, build + +1. Use the **Edit** tool to insert the drafted `## [X.Y.Z]` block at the **top** of `CHANGELOG.md` (under the intro, above the previous version), and add the link reference with the other `[x.y.z]:` links at the bottom. +2. 
Bump (also updates `package-lock.json`; `--allow-same-version` keeps re-runs safe): + ```bash + npm version X.Y.Z --no-git-tag-version --allow-same-version + ``` +3. Build (fail fast before any push/publish): + ```bash + npm run build + ``` + +### Step 7 — Commit + push + +`release.sh` tags HEAD, so the bump must be committed first. + +```bash +git add package.json package-lock.json CHANGELOG.md +git commit -m "release: X.Y.Z" +git push +``` + +### Step 8 — Publish to npm + +```bash +npm publish --access public +``` + +### Step 9 — GitHub release + +`scripts/release.sh` reads the `## [X.Y.Z]` block from CHANGELOG.md, tags `vX.Y.Z`, pushes the tag, and creates the GitHub release. It is idempotent. + +```bash +./scripts/release.sh +``` + +### Step 10 — Verify + +Confirm against the **registry**, not the website (the website caches): + +```bash +npm view "$PKG" version # must equal X.Y.Z +``` + +Report the release URL (`scripts/release.sh` prints it) and the published version. + +## If something fails midway + +Re-running is safe: `npm version --allow-same-version` no-ops if already bumped, `git commit` should be skipped if nothing's staged (check with `git diff --cached --quiet`; a bare `git commit` exits non-zero when there is nothing to commit), `git push` no-ops if up to date, and `scripts/release.sh` skips tag/release steps already done. Re-run from the failed step. 
diff --git a/publish.js b/publish.js deleted file mode 100644 index cbbabd75..00000000 --- a/publish.js +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env node -const { execSync } = require('child_process'); -const fs = require('fs'); -const path = require('path'); -const readline = require('readline'); - -const PKG_PATH = path.join(__dirname, 'package.json'); -const pkg = JSON.parse(fs.readFileSync(PKG_PATH, 'utf-8')); -const [major, minor, patch] = pkg.version.split('.').map(Number); - -const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); - -function ask(question) { - return new Promise((resolve) => rl.question(question, resolve)); -} - -async function main() { - console.log(`\nCurrent version: ${pkg.version}\n`); - console.log(' 1) patch -> ' + `${major}.${minor}.${patch + 1}`); - console.log(' 2) minor -> ' + `${major}.${minor + 1}.0`); - console.log(' 3) major -> ' + `${major + 1}.0.0`); - console.log(''); - - const choice = await ask('Bump version (1/2/3): '); - - let bump; - switch (choice.trim()) { - case '1': bump = 'patch'; break; - case '2': bump = 'minor'; break; - case '3': bump = 'major'; break; - default: - console.log('Invalid choice. Exiting.'); - rl.close(); - process.exit(1); - } - - // Bump version in package.json - execSync(`npm version ${bump} --no-git-tag-version`, { stdio: 'inherit' }); - - const updated = JSON.parse(fs.readFileSync(PKG_PATH, 'utf-8')); - console.log(`\nVersion bumped to ${updated.version}`); - - const confirm = await ask(`Publish ${updated.name}@${updated.version} to npm? 
(y/n): `); - if (confirm.trim().toLowerCase() !== 'y') { - console.log('Aborted.'); - rl.close(); - process.exit(0); - } - - // Build and publish - console.log('\nBuilding...'); - execSync('npm run build', { stdio: 'inherit' }); - - console.log('\nPublishing...'); - execSync('npm publish --access public', { stdio: 'inherit' }); - - console.log(`\nPublished ${updated.name}@${updated.version}`); - rl.close(); -} - -main().catch((err) => { - console.error(err); - rl.close(); - process.exit(1); -}); diff --git a/run-interactive-test.md b/run-interactive-test.md new file mode 100644 index 00000000..448c9e62 --- /dev/null +++ b/run-interactive-test.md @@ -0,0 +1,131 @@ +# Running the agent-behavior test (how agents actually use codegraph) + +This explains how to measure **how a Claude Code agent uses the codegraph MCP +tools** on a real repo — which tools it calls (does it lead with +`codegraph_explore`?), how many follow-up `Read`/`Grep`s it does, and the token +cost. Use it when changing tool guidance (`server-instructions.ts`, +`instructions-template.ts`, tool descriptions) or retrieval, to verify the +change actually shifts agent behavior. + +Scripts live in `scripts/agent-eval/`. + +## Why two harnesses (read this first) + +| | Interactive (`itrun.sh`) | Headless (`run-agent.sh`) | +|---|---|---| +| Drives | the real TUI via tmux | `claude -p` print mode | +| Subagent it picks | **Explore** (matches real UX) | general-purpose (diverges) | +| Metrics | tool breakdown (from session logs) + `Done(…)` token summary | exact per-tool calls + tokens/cost (stream-json) | +| Cost | Claude Max subscription | API $ (`total_cost_usd`) | + +**Headless `claude -p` does NOT reproduce what users see** — it silently picks +the general-purpose subagent, while interactive sessions delegate to the +read-first **Explore** subagent. So for "what does my session actually do," use +the interactive harness. 
For a clean per-tool/token breakdown in one shot, use +headless (and ask for the Explore subagent in the prompt if you want that path). + +## Prerequisites + +- **tmux 3.0+** +- A logged-in `claude` CLI (Claude Max or API). +- codegraph configured as an MCP server (`claude mcp list` shows `codegraph`). + The interactive harness uses your global config, so it runs whatever + `codegraph` resolves to — point that at your dev build (`npm link` / the + symlinked global) to test local changes. +- A target repo, cloned and indexed: + ```bash + git clone --depth 1 https://github.com/square/okhttp /tmp/corpus/okhttp + cd /tmp/corpus/okhttp && codegraph init -i + ``` + Good scale spread for a sweep: Alamofire (~100 files), Excalidraw (~600), + OkHttp (~640), VS Code (~10k). + +## Interactive test (the faithful one) + +```bash +scripts/agent-eval/itrun.sh