Skip to content

Commit e4908e1

Browse files
committed
feat: Add database schema v3 with optimized node lookups and improved error handling
Adds expression index on lower(name) for memory-efficient case-insensitive searches, replacing in-memory caches that caused OOM on large codebases. Includes batched reference resolution, enhanced error reporting with detailed breakdown by error type, and improved CLI progress display for scanning phases.
1 parent 9cd5ef9 commit e4908e1

15 files changed

Lines changed: 421 additions & 141 deletions

__tests__/foundation.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ describe('Database Connection', () => {
317317

318318
const version = db.getSchemaVersion();
319319
expect(version).not.toBeNull();
320-
expect(version?.version).toBe(2);
320+
expect(version?.version).toBe(3);
321321

322322
db.close();
323323
});

__tests__/pr19-improvements.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => {
299299
describe('Schema v2 Migration', () => {
300300
it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => {
301301
const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations');
302-
expect(CURRENT_SCHEMA_VERSION).toBe(2);
302+
expect(CURRENT_SCHEMA_VERSION).toBe(3);
303303
});
304304

305305
it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => {

src/bin/codegraph.ts

Lines changed: 131 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -192,11 +192,16 @@ function printProgress(progress: IndexProgress): void {
192192
};
193193
194194
const phaseName = phaseNames[progress.phase] || progress.phase;
195-
const bar = progressBar(progress.current, progress.total);
196195
const file = progress.currentFile ? chalk.dim(` ${progress.currentFile}`) : '';
197196
198-
// Clear line and print progress
199-
process.stdout.write(`\r${chalk.cyan(phaseName)}: ${bar}${file}`.padEnd(100));
197+
if (progress.total > 0) {
198+
const bar = progressBar(progress.current, progress.total);
199+
process.stdout.write(`\r${chalk.cyan(phaseName)}: ${bar}${file}`.padEnd(100));
200+
} else {
201+
// No known total (e.g. scanning) — show a running count
202+
const count = progress.current > 0 ? ` ${chalk.green(formatNumber(progress.current))} found` : '';
203+
process.stdout.write(`\r${chalk.cyan(phaseName)}:${count}${file}`.padEnd(100));
204+
}
200205
}
201206
202207
/**
@@ -227,6 +232,121 @@ function warn(message: string): void {
227232
console.log(chalk.yellow('⚠') + ' ' + message);
228233
}
229234
235+
/**
 * Print a summary of an indexing run, followed by a breakdown of errors by code.
 *
 * Output has three parts:
 *  1. A headline: files indexed (plus node/edge counts and duration), a hard
 *     failure message, or a "nothing to index" warning.
 *  2. When anything failed: a per-code count of `severity === 'error'` entries,
 *     and — if `projectPath` is given — a detailed log written via `writeErrorLog`.
 *  3. When nothing failed and `projectPath` is given: removal of a stale
 *     `.codegraph/errors.log` left by an earlier run.
 *
 * A run counts as having errors when `filesErrored > 0` OR when `errors` holds
 * at least one error-severity entry. Checking both keeps non-file failures from
 * being reported as a clean run (and from having their log deleted).
 *
 * @param result - Outcome of the indexing run. Entries in `errors` whose
 *   severity is not `'error'` are ignored by the breakdown.
 * @param projectPath - Project root; when omitted no error log is written or removed.
 */
function printIndexResult(result: { success: boolean; filesIndexed: number; filesSkipped: number; filesErrored: number; nodesCreated: number; edgesCreated: number; errors: Array<{ message: string; filePath?: string; severity: string; code?: string }>; durationMs: number }, projectPath?: string): void {
  const hasFileErrors = result.filesErrored > 0;
  const errorEntries = result.errors.filter((err) => err.severity === 'error');
  const hasErrors = hasFileErrors || errorEntries.length > 0;

  // Always show what was indexed
  if (result.filesIndexed > 0) {
    if (hasFileErrors) {
      success(`Indexed ${formatNumber(result.filesIndexed)} files (${formatNumber(result.filesErrored)} could not be parsed)`);
    } else {
      success(`Indexed ${formatNumber(result.filesIndexed)} files`);
    }
    info(`Created ${formatNumber(result.nodesCreated)} nodes and ${formatNumber(result.edgesCreated)} edges`);
    info(`Completed in ${formatDuration(result.durationMs)}`);
  } else if (hasFileErrors) {
    error(`Indexing failed: all ${formatNumber(result.filesErrored)} files had errors`);
  } else if (errorEntries.length > 0) {
    // Nothing was indexed and no file was counted as errored, yet errors were
    // recorded — report them instead of claiming there was nothing to index.
    error(`Indexing failed with ${formatNumber(errorEntries.length)} errors`);
  } else {
    warn('No files found to index');
  }

  if (!hasErrors) {
    if (projectPath) {
      // No errors — clean up any stale error log. Best-effort: failing to
      // remove an old log must not turn a successful run into a crash.
      const logPath = path.join(projectPath, '.codegraph', 'errors.log');
      try {
        if (fs.existsSync(logPath)) {
          fs.unlinkSync(logPath);
        }
      } catch {
        // Intentionally ignored: the stale log is only a leftover diagnostic.
      }
    }
    return;
  }

  // Group errors by code for a concise summary
  const errorsByCode = new Map<string, number>();
  for (const err of errorEntries) {
    const code = err.code || 'unknown';
    errorsByCode.set(code, (errorsByCode.get(code) || 0) + 1);
  }

  const codeLabels: Record<string, string> = {
    parse_error: 'files failed to parse',
    read_error: 'files could not be read',
    size_exceeded: 'files exceeded size limit',
    path_traversal: 'blocked paths',
    unsupported_language: 'unsupported language',
    parser_error: 'parser initialization failures',
  };

  // Skip the header when there is nothing to list under it.
  if (errorsByCode.size > 0) {
    console.log('');
    console.log(chalk.dim(' Error breakdown:'));
    for (const [code, count] of errorsByCode) {
      const label = codeLabels[code] || code;
      console.log(chalk.dim(` ${formatNumber(count)} ${label}`));
    }
  }

  // Write detailed error log to .codegraph/errors.log
  if (projectPath) {
    writeErrorLog(projectPath, result.errors);
  }

  // Reassure the user the index is usable (only meaningful for per-file failures)
  if (hasFileErrors && result.filesIndexed > 0) {
    console.log('');
    info('The index is fully usable — only the failed files are missing from the graph.');
    info('This is common in large repos with test fixtures or generated files that use non-standard syntax.');
  }
}
302+
303+
/**
 * Write the detailed list of indexing errors to `.codegraph/errors.log`.
 *
 * Only entries with `severity === 'error'` are written. Errors that carry a
 * `filePath` are listed first, grouped by file in first-seen order, one
 * `path: message` line each; errors without a file follow as bare messages.
 *
 * Does nothing when the `.codegraph` directory does not exist. A failure to
 * write the log is reported as a warning rather than thrown: the log is a
 * diagnostic aid and must not abort a run that has already finished.
 *
 * @param projectPath - Project root that contains the `.codegraph` directory.
 * @param errors - All errors reported by the indexing run.
 */
function writeErrorLog(projectPath: string, errors: Array<{ message: string; filePath?: string; severity: string; code?: string }>): void {
  const cgDir = path.join(projectPath, '.codegraph');
  if (!fs.existsSync(cgDir)) return;

  const logPath = path.join(cgDir, 'errors.log');

  // Group error messages by file path
  const messagesByFile = new Map<string, string[]>();
  const unattributedMessages: string[] = [];

  for (const err of errors) {
    if (err.severity !== 'error') continue;
    if (!err.filePath) {
      unattributedMessages.push(err.message);
      continue;
    }
    const existing = messagesByFile.get(err.filePath);
    if (existing) {
      existing.push(err.message);
    } else {
      messagesByFile.set(err.filePath, [err.message]);
    }
  }

  const lines: string[] = [
    `CodeGraph Error Log ${new Date().toISOString()}`,
    `${messagesByFile.size} files with errors`,
    '',
  ];

  for (const [filePath, messages] of messagesByFile) {
    for (const message of messages) {
      lines.push(`${filePath}: ${message}`);
    }
  }

  // Plain loop rather than push(...spread): the list can be very long on
  // large repositories and spread arguments are bounded by the call stack.
  for (const message of unattributedMessages) {
    lines.push(message);
  }

  try {
    fs.writeFileSync(logPath, lines.join('\n') + '\n');
  } catch (writeErr) {
    const reason = writeErr instanceof Error ? writeErr.message : String(writeErr);
    warn(`Could not write .codegraph/errors.log: ${reason}`);
    return;
  }

  info(`See .codegraph/errors.log for the full list of failed files`);
}
349+
230350
// =============================================================================
231351
// Commands
232352
// =============================================================================
@@ -239,7 +359,8 @@ program
239359
.description('Initialize CodeGraph in a project directory')
240360
.option('-i, --index', 'Run initial indexing after initialization')
241361
.action(async (pathArg: string | undefined, options: { index?: boolean }) => {
242-
const projectPath = resolveProjectPath(pathArg);
362+
// init should always target the exact path given (or cwd), never walk up parents
363+
const projectPath = path.resolve(pathArg || process.cwd());
243364
244365
console.log(chalk.bold('\nInitializing CodeGraph...\n'));
245366
@@ -271,13 +392,7 @@ program
271392
// Clear progress line
272393
process.stdout.write('\r' + ' '.repeat(100) + '\r');
273394
274-
if (result.success) {
275-
success(`Indexed ${formatNumber(result.filesIndexed)} files`);
276-
info(`Created ${formatNumber(result.nodesCreated)} nodes and ${formatNumber(result.edgesCreated)} edges`);
277-
info(`Completed in ${formatDuration(result.durationMs)}`);
278-
} else {
279-
warn(`Indexing completed with ${result.errors.length} errors`);
280-
}
395+
printIndexResult(result, projectPath);
281396
} else {
282397
info('Run "codegraph index" to index the project');
283398
}
@@ -376,22 +491,11 @@ program
376491
process.stdout.write('\r' + ' '.repeat(100) + '\r');
377492
}
378493
379-
if (result.success) {
380-
if (!options.quiet) {
381-
success(`Indexed ${formatNumber(result.filesIndexed)} files`);
382-
info(`Created ${formatNumber(result.nodesCreated)} nodes and ${formatNumber(result.edgesCreated)} edges`);
383-
info(`Completed in ${formatDuration(result.durationMs)}`);
384-
}
385-
} else {
386-
if (!options.quiet) {
387-
warn(`Indexing completed with ${result.errors.length} errors`);
388-
for (const err of result.errors.slice(0, 5)) {
389-
console.log(chalk.dim(` - ${err.message}`));
390-
}
391-
if (result.errors.length > 5) {
392-
console.log(chalk.dim(` ... and ${result.errors.length - 5} more`));
393-
}
394-
}
494+
if (!options.quiet) {
495+
printIndexResult(result, projectPath);
496+
}
497+
498+
if (!result.success) {
395499
process.exit(1);
396500
}
397501

src/db/migrations.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter';
99
/**
1010
* Current schema version
1111
*/
12-
export const CURRENT_SCHEMA_VERSION = 2;
12+
export const CURRENT_SCHEMA_VERSION = 3;
1313

1414
/**
1515
* Migration definition
@@ -45,6 +45,15 @@ const migrations: Migration[] = [
4545
`);
4646
},
4747
},
48+
{
49+
version: 3,
50+
description: 'Add lower(name) expression index for memory-efficient case-insensitive lookups',
51+
up: (db) => {
52+
db.exec(`
53+
CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name));
54+
`);
55+
},
56+
},
4857
];
4958

5059
/**

src/db/queries.ts

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,12 @@ export class QueryBuilder {
172172
insertUnresolved?: SqliteStatement;
173173
deleteUnresolvedByNode?: SqliteStatement;
174174
getUnresolvedByName?: SqliteStatement;
175+
getNodesByName?: SqliteStatement;
176+
getNodesByQualifiedNameExact?: SqliteStatement;
177+
getNodesByLowerName?: SqliteStatement;
178+
getUnresolvedCount?: SqliteStatement;
179+
getUnresolvedBatch?: SqliteStatement;
180+
getAllFilePaths?: SqliteStatement;
175181
} = {};
176182

177183
constructor(db: SqliteDatabase) {
@@ -425,6 +431,43 @@ export class QueryBuilder {
425431
return rows.map(rowToNode);
426432
}
427433

434+
/**
 * Fetch every node whose `name` column equals the given name.
 *
 * The prepared statement is created on first call and cached on `this.stmts`.
 * The lookup is meant to be served by the `idx_nodes_name` index.
 *
 * @param name - Exact node name to match.
 * @returns All matching rows converted with `rowToNode`.
 */
getNodesByName(name: string): Node[] {
  let stmt = this.stmts.getNodesByName;
  if (!stmt) {
    stmt = this.db.prepare('SELECT * FROM nodes WHERE name = ?');
    this.stmts.getNodesByName = stmt;
  }
  const matches = stmt.all(name) as NodeRow[];
  return matches.map(rowToNode);
}
444+
445+
/**
 * Fetch every node whose `qualified_name` column equals the given value.
 *
 * The prepared statement is created on first call and cached on `this.stmts`.
 * The lookup is meant to be served by the `idx_nodes_qualified_name` index.
 *
 * @param qualifiedName - Exact qualified name to match.
 * @returns All matching rows converted with `rowToNode`.
 */
getNodesByQualifiedNameExact(qualifiedName: string): Node[] {
  let stmt = this.stmts.getNodesByQualifiedNameExact;
  if (!stmt) {
    stmt = this.db.prepare('SELECT * FROM nodes WHERE qualified_name = ?');
    this.stmts.getNodesByQualifiedNameExact = stmt;
  }
  const matches = stmt.all(qualifiedName) as NodeRow[];
  return matches.map(rowToNode);
}
457+
458+
/**
 * Get nodes whose name matches case-insensitively, via the
 * `idx_nodes_lower_name` expression index on `lower(name)`.
 *
 * The argument is folded with SQLite's own `lower()` before comparison, so it
 * is normalised by the same function that built the index. Already-lowercased
 * input behaves as before; mixed-case input now matches instead of silently
 * returning nothing.
 *
 * NOTE(review): SQLite's built-in `lower()` folds ASCII letters only (unless
 * the ICU extension is loaded), so names differing only in the case of
 * non-ASCII letters will not match each other.
 *
 * @param lowerName - Name to look up; conventionally already lowercased.
 * @returns All matching rows converted with `rowToNode`.
 */
getNodesByLowerName(lowerName: string): Node[] {
  if (!this.stmts.getNodesByLowerName) {
    // The left-hand side must stay textually identical to the indexed
    // expression for the planner to consider the expression index.
    this.stmts.getNodesByLowerName = this.db.prepare(
      'SELECT * FROM nodes WHERE lower(name) = lower(?)'
    );
  }
  const rows = this.stmts.getNodesByLowerName.all(lowerName) as NodeRow[];
  return rows.map(rowToNode);
}
470+
428471
/**
429472
* Search nodes by name using FTS with fallback to LIKE for better matching
430473
*
@@ -886,6 +929,53 @@ export class QueryBuilder {
886929
}));
887930
}
888931

932+
/**
 * Count the rows in `unresolved_refs` with a single `COUNT(*)` query, so no
 * reference rows are materialised.
 *
 * The prepared statement is created on first call and cached on `this.stmts`.
 *
 * @returns Number of unresolved reference rows.
 */
getUnresolvedReferencesCount(): number {
  let stmt = this.stmts.getUnresolvedCount;
  if (!stmt) {
    stmt = this.db.prepare('SELECT COUNT(*) as count FROM unresolved_refs');
    this.stmts.getUnresolvedCount = stmt;
  }
  const { count } = stmt.get() as { count: number };
  return count;
}
944+
945+
/**
 * Get one page of unresolved references using LIMIT/OFFSET pagination.
 * Used to process references in bounded memory chunks.
 *
 * Rows are ordered by `rowid` so consecutive pages are stable: without an
 * ORDER BY, SQLite does not guarantee the order in which rows are returned.
 * (Assumes `unresolved_refs` is an ordinary rowid table — TODO confirm it is
 * not declared WITHOUT ROWID.)
 *
 * NOTE(review): OFFSET counts rows as they exist at query time. If rows are
 * deleted from `unresolved_refs` between calls, later offsets will skip the
 * rows that shifted into earlier positions.
 *
 * @param offset - Number of rows to skip.
 * @param limit - Maximum number of rows to return.
 * @returns The page of references, mapped from their database rows.
 */
getUnresolvedReferencesBatch(offset: number, limit: number): UnresolvedReference[] {
  if (!this.stmts.getUnresolvedBatch) {
    this.stmts.getUnresolvedBatch = this.db.prepare(
      'SELECT * FROM unresolved_refs ORDER BY rowid LIMIT ? OFFSET ?'
    );
  }
  // Bind order follows the SQL text: LIMIT first, then OFFSET.
  const rows = this.stmts.getUnresolvedBatch.all(limit, offset) as UnresolvedRefRow[];
  return rows.map((row) => ({
    fromNodeId: row.from_node_id,
    referenceName: row.reference_name,
    referenceKind: row.reference_kind as EdgeKind,
    line: row.line,
    column: row.col,
    candidates: row.candidates ? safeJsonParse(row.candidates, undefined) : undefined,
    filePath: row.file_path,
    language: row.language as Language,
  }));
}
967+
968+
/**
 * List the `path` of every row in `files`, sorted by path.
 *
 * Only the `path` column is selected, so no full file records are built.
 * The prepared statement is created on first call and cached on `this.stmts`.
 *
 * @returns Tracked file paths in ascending path order.
 */
getAllFilePaths(): string[] {
  let stmt = this.stmts.getAllFilePaths;
  if (!stmt) {
    stmt = this.db.prepare('SELECT path FROM files ORDER BY path');
    this.stmts.getAllFilePaths = stmt;
  }
  const records = stmt.all() as Array<{ path: string }>;
  return records.map((record) => record.path);
}
978+
889979
/**
890980
* Get unresolved references scoped to specific file paths.
891981
* Uses the idx_unresolved_file_path index for efficient lookup.

src/db/schema.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ CREATE INDEX IF NOT EXISTS idx_nodes_qualified_name ON nodes(qualified_name);
9191
CREATE INDEX IF NOT EXISTS idx_nodes_file_path ON nodes(file_path);
9292
CREATE INDEX IF NOT EXISTS idx_nodes_language ON nodes(language);
9393
CREATE INDEX IF NOT EXISTS idx_nodes_file_line ON nodes(file_path, start_line);
94+
CREATE INDEX IF NOT EXISTS idx_nodes_lower_name ON nodes(lower(name));
9495

9596
-- Full-text search index on node names, docstrings, and signatures
9697
CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(

src/extraction/dfm-extractor.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ export class DfmExtractor {
3939
this.errors.push({
4040
message: `DFM extraction error: ${error instanceof Error ? error.message : String(error)}`,
4141
severity: 'error',
42+
code: 'parse_error',
4243
});
4344
}
4445

0 commit comments

Comments
 (0)