Skip to content

Commit 4a8d2f0

Browse files
committed
feat: Add content-based C++ detection for .h headers
Addresses C++ classes missing from .h files where extension-based detection defaults to 'c' language which has no class extraction support. Adds looksLikeCpp() heuristic that scans first 8KB for C++-specific patterns (namespace, class, template, access specifiers) to promote .h files to 'cpp' language when C++ constructs are detected. Ensures cpp grammar is loaded alongside c to handle potential .h promotion during parsing.
1 parent 237fb3b commit 4a8d2f0

5 files changed

Lines changed: 36 additions & 11 deletions

File tree

docs/SEARCH_QUALITY_LOOP.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,7 @@ test().catch(console.error);
446446
| `qualified_name` missing class for nested methods | Extraction not walking parent stack correctly | `src/extraction/tree-sitter.ts: visitNode()` |
447447
| Import edges missing | `extractImport` returns null for this syntax | `src/extraction/languages/<lang>.ts: extractImport` |
448448
| C++ classes/structs/enums missing from macro namespaces | Macros like `NLOHMANN_JSON_NAMESPACE_BEGIN` cause tree-sitter to misparse namespace blocks as `function_definition` | `src/extraction/languages/c-cpp.ts: isMisparsedFunction` filters bad names; `src/extraction/tree-sitter.ts: visitFunctionBody` extracts structural nodes |
449+
| C++ classes missing from `.h` headers | `.h` files default to `c` language which has `classTypes: []` | `src/extraction/grammars.ts: looksLikeCpp()` — content-based heuristic promotes `.h` files to `cpp` when C++ patterns detected |
449450

450451
## After Fixing Issues
451452

@@ -526,7 +527,7 @@ if (receiverType) {
526527
- [x] **Python** — NOT needed. Methods nested in class body. Verified against Flask
527528
- [x] **Rust** — `getReceiverType` walks up to parent `impl_item` to extract type name. Also adds `contains` edges from struct to impl methods. Verified against Deno
528529
- [x] **C** — NOT needed. No methods in C. Strong function/struct/enum extraction with excellent call edge density. Verified against Redis
529-
- [x] **C++** — NOT needed for header-only libs. `isMisparsedFunction` hook filters macro-caused misparse artifacts (e.g. `NLOHMANN_JSON_NAMESPACE_BEGIN`). `visitFunctionBody` now extracts structural nodes (classes/structs/enums) inside macro-confused "function" bodies. Verified against nlohmann/json. Note: out-of-class `Type::method()` definitions would need `getReceiverType` but are uncommon in header-only codebases.
530+
- [x] **C++** — NOT needed for header-only libs. `isMisparsedFunction` hook filters macro-caused misparse artifacts (e.g. `NLOHMANN_JSON_NAMESPACE_BEGIN`). `visitFunctionBody` now extracts structural nodes (classes/structs/enums) inside macro-confused "function" bodies. Content-based `.h` detection (`looksLikeCpp` in `grammars.ts`) promotes C++ headers to `cpp` language so classes in `.h` files are extracted. Verified against nlohmann/json and gRPC. Note: out-of-class `Type::method()` definitions would need `getReceiverType` but are uncommon in header-only codebases.
530531

531532
### Needs Verification
532533

src/extraction/grammars.ts

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,25 @@ export function getParser(language: Language): Parser | null {
174174
/**
175175
* Detect language from the file extension, optionally refined by file content.
*
* The extension is everything from the last '.' in `filePath`, lower-cased, and is
* looked up in EXTENSION_MAP; extensions with no entry yield 'unknown'.
* When `source` is provided (and non-empty) and a `.h` path maps to 'c', the content
* is checked with looksLikeCpp() and 'cpp' is returned if it matches.
*
* @param filePath - Path whose extension selects the language.
* @param source - Optional file content; only consulted for `.h` files.
* @returns The detected language, or 'unknown'.
176176
*/
177-
export function detectLanguage(filePath: string): Language {
177+
export function detectLanguage(filePath: string, source?: string): Language {
178178
const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
179-
return EXTENSION_MAP[ext] || 'unknown';
179+
const lang = EXTENSION_MAP[ext] || 'unknown';
180+
181+
// .h files could be C or C++ — check source content for C++ features
182+
if (lang === 'c' && ext === '.h' && source) {
183+
if (looksLikeCpp(source)) return 'cpp';
184+
}
185+
186+
return lang;
187+
}
188+
189+
/**
 * Heuristic: does a `.h` file look like C++ rather than C?
 *
 * Only the first 8192 characters (UTF-16 code units) of `source` are examined.
 * Comments and string/character literals are blanked out first, so that prose
 * such as "virtual address" or "see the namespace docs" in a plain C header
 * does not trigger a false positive. The remaining text is then searched for
 * constructs that strongly indicate C++: `namespace`, `class Name {` / `class Name :`,
 * `template <`, access specifiers (`public:` etc.), `virtual`, `using namespace`
 * and `using Alias =`.
 *
 * This is a textual heuristic, not a parse: words like `virtual` or `namespace`
 * are legal identifiers in C, so a C header that uses them as identifiers in
 * code will still be classified as C++.
 *
 * @param source - Full text of the header file.
 * @returns `true` if a C++-specific pattern is found in the sampled code.
 */
function looksLikeCpp(source: string): boolean {
  const sample = source.substring(0, 8192);

  // Replace line comments, block comments (including one cut off by the sample
  // boundary) and single-line string/char literals with a space. A space rather
  // than '' keeps the tokens on either side from fusing together.
  const code = sample.replace(
    /\/\/[^\n]*|\/\*[\s\S]*?(?:\*\/|$)|"(?:\\.|[^"\\\n])*"|'(?:\\.|[^'\\\n])*'/g,
    ' '
  );

  return /\bnamespace\b|\bclass\s+\w+\s*[:{]|\btemplate\s*<|\b(?:public|private|protected)\s*:|\bvirtual\b|\busing\s+(?:namespace\b|\w+\s*=)/.test(code);
}
181197

182198
/**

src/extraction/index.ts

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,10 @@ export class ExtractionOrchestrator {
472472

473473
// Detect needed languages and load grammars in the parse worker
474474
const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
475+
// .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
476+
if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
477+
neededLanguages.push('cpp');
478+
}
475479

476480
// Try to use a worker thread for parsing (keeps main thread unblocked for UI).
477481
// Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
@@ -580,7 +584,7 @@ export class ExtractionOrchestrator {
580584
async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
581585
if (!WorkerClass) {
582586
// In-process fallback
583-
return extractFromSource(filePath, content, detectLanguage(filePath));
587+
return extractFromSource(filePath, content, detectLanguage(filePath, content));
584588
}
585589

586590
// Recycle the worker before the next parse if we've hit the threshold.
@@ -706,7 +710,7 @@ export class ExtractionOrchestrator {
706710

707711
// Store in database on main thread (SQLite is not thread-safe)
708712
if (result.nodes.length > 0 || result.errors.length === 0) {
709-
const language = detectLanguage(filePath);
713+
const language = detectLanguage(filePath, content);
710714
this.storeExtractionResult(filePath, content, language, stats, result);
711715
}
712716

@@ -779,7 +783,7 @@ export class ExtractionOrchestrator {
779783
}
780784

781785
if (result.nodes.length > 0 || result.errors.length === 0) {
782-
const language = detectLanguage(filePath);
786+
const language = detectLanguage(filePath, content);
783787
const stats = await fsp.stat(path.join(this.rootDir, filePath));
784788
this.storeExtractionResult(filePath, content, language, stats, result);
785789

@@ -830,7 +834,7 @@ export class ExtractionOrchestrator {
830834
}
831835

832836
if (result.nodes.length > 0 || result.errors.length === 0) {
833-
const language = detectLanguage(filePath);
837+
const language = detectLanguage(filePath, fullContent);
834838
const stats = await fsp.stat(path.join(this.rootDir, filePath));
835839
this.storeExtractionResult(filePath, fullContent, language, stats, result);
836840

@@ -989,7 +993,7 @@ export class ExtractionOrchestrator {
989993
}
990994

991995
// Detect language
992-
const language = detectLanguage(relativePath);
996+
const language = detectLanguage(relativePath, content);
993997
if (!isLanguageSupported(language)) {
994998
return {
995999
nodes: [],
@@ -1201,6 +1205,10 @@ export class ExtractionOrchestrator {
12011205
// Load only grammars needed for changed files
12021206
if (filesToIndex.length > 0) {
12031207
const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
1208+
// .h files default to 'c' but may be C++ — ensure cpp grammar is loaded
1209+
if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
1210+
neededLanguages.push('cpp');
1211+
}
12041212
await loadGrammarsForLanguages(neededLanguages);
12051213
}
12061214

src/extraction/parse-worker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ parentPort!.on('message', async (msg: { type: string; id?: number; filePath?: st
2020
} else if (msg.type === 'parse') {
2121
const { id, filePath, content } = msg;
2222
try {
23-
const language = detectLanguage(filePath!);
23+
const language = detectLanguage(filePath!, content);
2424
const result: ExtractionResult = extractFromSource(filePath!, content!, language);
2525

2626
// Periodic parser reset to reclaim WASM heap memory

src/extraction/tree-sitter.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ export class TreeSitterExtractor {
106106
constructor(filePath: string, source: string, language?: Language) {
107107
this.filePath = filePath;
108108
this.source = source;
109-
this.language = language || detectLanguage(filePath);
109+
this.language = language || detectLanguage(filePath, source);
110110
this.extractor = EXTRACTORS[this.language] || null;
111111
}
112112

@@ -2087,7 +2087,7 @@ export function extractFromSource(
20872087
source: string,
20882088
language?: Language
20892089
): ExtractionResult {
2090-
const detectedLanguage = language || detectLanguage(filePath);
2090+
const detectedLanguage = language || detectLanguage(filePath, source);
20912091
const fileExtension = path.extname(filePath).toLowerCase();
20922092

20932093
// Use custom extractor for Svelte

0 commit comments

Comments
 (0)