Skip to content

Commit 07c093c

Browse files
colbymchenry and claude authored
fix(extraction): index nested non-submodule git repos (colbymchenry#193) (colbymchenry#217)
`codegraph init -i` from a git super-repo containing independent nested git repositories (not submodules) reported "No files found to index": git ls-files reports an embedded repo only as an opaque `subdir/` entry and never lists its files. Detect embedded repos via that trailing-slash signal and recurse `git ls-files` into each, indexing tracked + untracked source and honoring each repo's own .gitignore. Reported by @timxx. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 9b06b0e commit 07c093c

3 files changed

Lines changed: 138 additions & 26 deletions

File tree

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,17 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6363
Thanks to [@essopsp](https://github.com/essopsp) for the repro.
6464

6565
### Fixed
66+
- **Indexing**: `codegraph init -i` now finds source inside nested, independent
67+
git repositories — separate clones living inside the workspace that are **not**
68+
git submodules (common in CMake "super-repo" layouts). When the top-level
69+
workspace is itself a git repo, `git ls-files` reports an embedded repo only as
70+
an opaque `subdir/` entry and never lists its files, so indexing from the
71+
workspace root reported "No files found to index" even though indexing each
72+
sub-repo individually worked. CodeGraph now detects these embedded repos and
73+
indexes their tracked and untracked source, honoring each repo's own
74+
`.gitignore`. Closes
75+
[#193](https://github.com/colbymchenry/codegraph/issues/193). Thanks to
76+
[@timxx](https://github.com/timxx) for the report.
6677
- **Native SQLite backend on Node 24**: indexing on Node 24 always dropped to
6778
the 5-10x-slower WASM backend, printing a `better-sqlite3 unavailable`
6879
warning that `npm rebuild better-sqlite3` / `xcode-select --install` could

__tests__/extraction.test.ts

Lines changed: 73 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3132,6 +3132,79 @@ describe('Git Submodules', () => {
31323132
});
31333133
});
31343134

3135+
describe('Nested non-submodule git repos', () => {
  let tempDir: string;

  /**
   * Lazily load `child_process` and hand back a runner that executes
   * `git <args>` inside `cwd` with all stdio piped (nothing leaks into the
   * test output).
   */
  const loadGit = async () => {
    const childProcess = await import('child_process');
    return (cwd: string, ...args: string[]) =>
      childProcess.execFileSync('git', args, { cwd, stdio: 'pipe' });
  };

  beforeEach(() => {
    tempDir = createTempDir();
  });

  afterEach(() => {
    cleanupTempDir(tempDir);
  });

  it('should index files in embedded git repos run from a git super-repo (issue #193)', async () => {
    const git = await loadGit();

    const workspace = path.join(tempDir, 'root');
    const committedRepo = path.join(workspace, 'sub_repo1');
    const untrackedRepo = path.join(workspace, 'sub_repo2');

    // The workspace root is a git repo of its own and carries no source
    // files directly, mirroring the CMake "super-repo" layout from the issue.
    fs.mkdirSync(path.join(workspace, 'coding'), { recursive: true });
    git(workspace, 'init', '-q');
    git(workspace, 'config', 'user.email', 'test@test.com');
    git(workspace, 'config', 'user.name', 'Test');
    fs.writeFileSync(
      path.join(workspace, 'CMakeLists.txt'),
      'cmake_minimum_required(VERSION 3.10)\n'
    );

    // First independent clone (NOT a submodule): its source is committed.
    fs.mkdirSync(path.join(committedRepo, 'src'), { recursive: true });
    git(committedRepo, 'init', '-q');
    git(committedRepo, 'config', 'user.email', 'test@test.com');
    git(committedRepo, 'config', 'user.name', 'Test');
    fs.writeFileSync(path.join(committedRepo, 'src', 'one.ts'), 'export const one = 1;');
    git(committedRepo, 'add', '-A');
    git(committedRepo, 'commit', '-q', '-m', 'sub1 init');

    // Second independent clone: its source exists only as untracked files.
    fs.mkdirSync(path.join(untrackedRepo, 'src'), { recursive: true });
    git(untrackedRepo, 'init', '-q');
    fs.writeFileSync(path.join(untrackedRepo, 'src', 'two.ts'), 'export const two = 2;');

    const files = scanDirectory(workspace, { ...DEFAULT_CONFIG, rootDir: workspace });

    // Committed and untracked source from the nested repos must both show up.
    for (const expected of ['sub_repo1/src/one.ts', 'sub_repo2/src/two.ts']) {
      expect(files).toContain(expected);
    }
  });

  it('should respect each embedded repo\'s own .gitignore', async () => {
    const git = await loadGit();

    const workspace = path.join(tempDir, 'root');
    const embeddedRepo = path.join(workspace, 'sub_repo');
    const embeddedSrc = path.join(embeddedRepo, 'src');

    fs.mkdirSync(workspace, { recursive: true });
    git(workspace, 'init', '-q');

    // The embedded repo ignores one of its two source files.
    fs.mkdirSync(embeddedSrc, { recursive: true });
    git(embeddedRepo, 'init', '-q');
    fs.writeFileSync(path.join(embeddedRepo, '.gitignore'), 'src/generated.ts\n');
    fs.writeFileSync(path.join(embeddedSrc, 'real.ts'), 'export const real = 1;');
    fs.writeFileSync(path.join(embeddedSrc, 'generated.ts'), 'export const generated = 1;');

    const files = scanDirectory(workspace, { ...DEFAULT_CONFIG, rootDir: workspace });

    expect(files).toContain('sub_repo/src/real.ts');
    expect(files).not.toContain('sub_repo/src/generated.ts');
  });
});
3207+
31353208
// =============================================================================
31363209
// Scala
31373210
// =============================================================================

src/extraction/index.ts

Lines changed: 54 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -125,10 +125,61 @@ export function shouldIncludeFile(
125125
return false;
126126
}
127127

128+
/**
 * Collect git-visible files (tracked + untracked, .gitignore-respected) from the
 * git repository rooted at `repoDir`, adding each to `files` with `prefix`
 * prepended so paths stay relative to the original scan root.
 *
 * Recurses into embedded git repositories — nested repos that are NOT submodules
 * (independent clones living inside the workspace, common in CMake "super-repo"
 * layouts). The parent repo's `git ls-files` cannot see into them: tracked output
 * skips them entirely, and untracked output reports them only as an opaque
 * "subdir/" entry (trailing slash) rather than expanding their files. Each
 * embedded repo is its own git boundary, so we re-run `git ls-files` inside it.
 * (See issue #193.)
 *
 * Paths are read with `git ls-files -z` (NUL-terminated, unquoted). Without
 * `-z`, git C-quotes "unusual" pathnames — by default any non-ASCII byte, per
 * `core.quotePath` — so such files would be recorded under a quoted, escaped
 * name that does not exist on disk.
 *
 * @param repoDir - Directory the git commands run in (passed as `cwd`).
 * @param prefix - String prepended to every reported path: `''` for the scan
 *   root, or the embedded repo's relative path (ending in `/`) when recursing.
 * @param files - Set mutated in place; receives the normalized paths.
 * @throws Whatever `execFileSync` throws when a git invocation fails (non-zero
 *   exit, timeout, output over `maxBuffer`). Nothing is caught here, so a
 *   failure inside an embedded repo propagates to the caller.
 */
function collectGitFiles(repoDir: string, prefix: string, files: Set<string>): void {
  const gitOpts = { cwd: repoDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'] };

  // Run `git ls-files -z <modeArgs>` and return the reported paths verbatim.
  // NUL separation means no quoting to undo and no whitespace trimming needed;
  // the empty string after the final NUL is dropped.
  const listPaths = (...modeArgs: string[]): string[] =>
    execFileSync('git', ['ls-files', '-z', ...modeArgs], gitOpts)
      .split('\0')
      .filter((entry) => entry.length > 0);

  // Tracked files. --recurse-submodules pulls in files from active submodules,
  // which the index would otherwise represent only as a commit pointer.
  // Without this, monorepos using submodules index 0 files. (See issue #147.)
  // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
  // can't be combined with -o, so untracked files are gathered separately below.
  for (const trackedPath of listPaths('-c', '--recurse-submodules')) {
    files.add(normalizePath(prefix + trackedPath));
  }

  // Untracked files (submodules manage their own untracked state). Embedded git
  // repos surface here as a single "subdir/" entry that git refuses to descend
  // into — recurse into those as their own repos so their source gets indexed.
  for (const untrackedPath of listPaths('-o', '--exclude-standard')) {
    if (!untrackedPath.endsWith('/')) {
      files.add(normalizePath(prefix + untrackedPath));
      continue;
    }

    // git only emits a trailing-slash directory entry for an embedded repo.
    // Guard with a .git check anyway, and skip anything else exactly as git
    // itself skips it (we never descend into a non-repo opaque dir).
    const childDir = path.join(repoDir, untrackedPath);
    if (fs.existsSync(path.join(childDir, '.git'))) {
      collectGitFiles(childDir, prefix + untrackedPath, files);
    }
  }
}
177+
128178
/**
129179
* Get all files visible to git (tracked + untracked but not ignored).
130-
* Respects .gitignore at all levels (root, subdirectories).
131-
* Returns null on failure (non-git project) so callers can fall back.
180+
* Respects .gitignore at all levels (root, subdirectories) and descends into
181+
* embedded (nested, non-submodule) git repos. Returns null on failure
182+
* (non-git project) so callers can fall back to a filesystem walk.
132183
*/
133184
function getGitVisibleFiles(rootDir: string): Set<string> | null {
134185
try {
@@ -157,30 +208,7 @@ function getGitVisibleFiles(rootDir: string): Set<string> | null {
157208
}
158209

159210
const files = new Set<string>();
160-
const gitOpts = { cwd: rootDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'] };
161-
162-
// Tracked files. --recurse-submodules pulls in files from active submodules,
163-
// which the main repo's index would otherwise represent only as a commit pointer.
164-
// Without this, monorepos using submodules index 0 files. (See issue #147.)
165-
// Note: --recurse-submodules only supports -c/--cached and --stage modes — it
166-
// can't be combined with -o, so untracked files are gathered separately below.
167-
const tracked = execFileSync('git', ['ls-files', '-c', '--recurse-submodules'], gitOpts);
168-
for (const line of tracked.split('\n')) {
169-
const trimmed = line.trim();
170-
if (trimmed) {
171-
files.add(normalizePath(trimmed));
172-
}
173-
}
174-
175-
// Untracked files in the main repo (submodules manage their own untracked state).
176-
const untracked = execFileSync('git', ['ls-files', '-o', '--exclude-standard'], gitOpts);
177-
for (const line of untracked.split('\n')) {
178-
const trimmed = line.trim();
179-
if (trimmed) {
180-
files.add(normalizePath(trimmed));
181-
}
182-
}
183-
211+
collectGitFiles(rootDir, '', files);
184212
return files;
185213
} catch {
186214
return null;

0 commit comments

Comments
 (0)