diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts
index fb542987e..26e9cb97a 100644
--- a/src/bin/codegraph.ts
+++ b/src/bin/codegraph.ts
@@ -20,6 +20,7 @@
  *   codegraph callees <symbol>     Find what a function/method calls
  *   codegraph impact <symbol>      Analyze what code is affected by changing a symbol
  *   codegraph affected [files]     Find test files affected by changes
+ *   codegraph specify [files]      Build a knowledge graph for specific files + their transitive import dependencies
  */
 
 import { Command } from 'commander';
@@ -32,6 +33,7 @@
 import { getGlyphs } from '../ui/glyphs';
 import { buildNode25BlockBanner, buildNodeTooOldBanner, MIN_NODE_MAJOR } from './node-version-check';
 import { relaunchWithWasmRuntimeFlagsIfNeeded } from '../extraction/wasm-runtime-flags';
+import { isSourceFile, getSupportedExtensions } from '../extraction';
 
 // Lazy-load heavy modules (CodeGraph, runInstaller) to keep CLI startup fast.
 async function loadCodeGraph(): Promise<typeof import('../index')> {
@@ -110,12 +112,12 @@ process.on('unhandledRejection', (reason) => {
 
 function main() {
 
-const program = new Command();
-
-// Version from package.json
-const packageJson = JSON.parse(
-  fs.readFileSync(path.join(__dirname, '..', '..', 'package.json'), 'utf-8')
-);
+// =============================================================================
+// Helper: Normalize a file path to forward-slash relative form (matching DB storage).
+// =============================================================================
+function normalizePath(p: string): string {
+  return p.replace(/\\/g, '/');
+}
 
 // =============================================================================
 // ANSI Color Helpers (avoid chalk ESM issues)
@@ -146,6 +148,13 @@ const chalk = {
   gray: (s: string) => `${colors.gray}${s}${colors.reset}`,
 };
 
+const program = new Command();
+
+// Version from package.json
+const packageJson = JSON.parse(
+  fs.readFileSync(path.join(__dirname, '..', '..', 'package.json'), 'utf-8')
+);
+
 program
   .name('codegraph')
   .description('Code intelligence and knowledge graph for any codebase')
@@ -1605,6 +1614,310 @@ program
     }
   });
 
+// Convert a simple glob into an anchored RegExp. Supported syntax:
+//   `**` followed by `/`  zero or more whole directories
+//   `**`                  any characters, including `/`
+//   `*`                   any characters except `/`
+//   `?`                   exactly one character except `/`
+// Every other character is matched literally.
+function globToRegExp(glob: string): RegExp {
+  const source = glob
+    // Escape regex metacharacters first; `*` and `?` are left for the glob rules below.
+    .replace(/[.+^${}()|[\]\\]/g, '\\$&')
+    // Park both globstar forms behind placeholders so the single-`*` rule cannot touch them.
+    .replace(/\*\*\//g, '{{GLOBSTAR_DIR}}')
+    .replace(/\*\*/g, '{{GLOBSTAR}}')
+    .replace(/\*/g, '[^/]*')
+    .replace(/\?/g, '[^/]')
+    .replace(/\{\{GLOBSTAR_DIR\}\}/g, '(?:.*/)?')
+    .replace(/\{\{GLOBSTAR\}\}/g, '.*');
+  return new RegExp(`^${source}$`);
+}
+
+/**
+ * codegraph specify [files...] [options]
+ *
+ * Build a knowledge graph for specific files and their transitive
+ * import dependencies — instead of indexing the entire repository.
+ *
+ * File arguments are resolved against the current directory first and, when
+ * nothing exists there, against the project root given by --path.
+ *
+ * Usage:
+ *   codegraph specify src/lib/a.ts src/lib/b.ts            (local install)
+ *   codegraph specify --path /some/project src/lib/a.ts    (specified project root)
+ */
+program
+  .command('specify [files...]')
+  .description('Build a knowledge graph for specific files and their transitive import dependencies')
+  .option('-p, --path <path>', 'Project root path (current dir if omitted)')
+  .option('--depth <n>', 'Max dependency discovery depth (0 = specified files only)', '10')
+  .option('-f, --filter <glob>', 'Only discover files matching this glob (e.g. "src/**/*.ts")')
+  .option('-j, --json', 'Output as JSON')
+  .option('-q, --quiet', 'Only output file paths, no decoration')
+  .action(async (fileArgs: string[], options: { path?: string; depth?: string; filter?: string; json?: boolean; quiet?: boolean }) => {
+    const projectRoot = path.resolve(options.path || process.cwd());
+    // Banners and progress lines go to stdout, so --json suppresses them just like
+    // --quiet does; otherwise the emitted JSON document would not be parseable.
+    const decorate = !options.quiet && !options.json;
+
+    // Reject NaN, negative numbers and trailing garbage instead of silently discovering nothing.
+    const depthArg = options.depth ?? '10';
+    if (!/^\d+$/.test(depthArg)) {
+      error(`Invalid --depth value "${depthArg}": expected a non-negative integer`);
+      process.exit(1);
+    }
+    const maxDepth = Number.parseInt(depthArg, 10);
+    const globFilter = options.filter ? globToRegExp(options.filter) : null;
+
+    // Forward-slash path relative to the project root, or null when the path lies outside it.
+    const toProjectRelative = (absolutePath: string): string | null => {
+      const rel = path.relative(projectRoot, absolutePath);
+      if (rel === '..' || rel.startsWith('..' + path.sep) || path.isAbsolute(rel)) return null;
+      return normalizePath(rel);
+    };
+
+    // 1. Collect user-specified files, resolve to relative paths
+    const specifiedFiles: string[] = [];
+    const rejected: string[] = [];
+
+    for (const fileArg of fileArgs) {
+      // path.resolve understands `/` on every platform (and `\` on Windows), so the argument
+      // is used as given; rewriting `/` to `\` would break every nested path on POSIX.
+      const fromCwd = path.resolve(fileArg);
+      const absolute = fs.existsSync(fromCwd) ? fromCwd : path.resolve(projectRoot, fileArg);
+      if (!fs.existsSync(absolute)) {
+        rejected.push(`File not found: ${fileArg}`);
+        continue;
+      }
+      const relative = toProjectRelative(absolute);
+      if (relative === null) {
+        rejected.push(`File outside project root: ${fileArg}`);
+        continue;
+      }
+      if (!isSourceFile(relative)) {
+        rejected.push(`Not a source file (skipped): ${relative}`);
+        continue;
+      }
+      if (!specifiedFiles.includes(relative)) specifiedFiles.push(relative);
+    }
+
+    if (specifiedFiles.length === 0) {
+      if (rejected.length > 0) {
+        for (const reason of rejected) error(reason);
+        process.exit(1);
+      }
+      if (!options.quiet) info('No valid files provided. Use file arguments.');
+      process.exit(0);
+    }
+    // Some arguments were unusable but others were fine: report the rejects on stderr
+    // instead of dropping them silently.
+    for (const reason of rejected) console.error(chalk.yellow(reason));
+
+    // `any` because the CodeGraph class is only reachable through the lazy loader.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    let cg: any = null;
+    let wasCreated = false;
+    let exitCode = 0;
+
+    try {
+      if (decorate) {
+        console.log(chalk.bold('CodeGraph — Specify Files Graph'));
+        console.log(chalk.dim('─────────────────────────────────'));
+        console.log(chalk.blue(`Specified files: ${specifiedFiles.length}`));
+        for (const f of specifiedFiles) {
+          console.log(chalk.cyan(`  ${f}`));
+        }
+      }
+
+      // 2. Initialize CodeGraph (or open existing)
+      const initialized = isInitialized(projectRoot);
+      if (!initialized && decorate) info('Project not initialized — creating temporary index...');
+
+      const { default: CodeGraph } = await loadCodeGraph();
+      if (initialized) {
+        cg = await CodeGraph.open(projectRoot);
+      } else {
+        cg = await CodeGraph.init(projectRoot, { index: false });
+        wasCreated = true;
+      }
+
+      // 3. Discover transitive dependencies
+      // Strategy: for each file, extract it via tree-sitter to get its
+      // import module names, then resolve those module names to files on disk.
+      // This reuses the same extraction pipeline (grammars/extractors) as
+      // `indexAll`, so it works for ALL supported languages without ad-hoc regex.
+      //
+      // The extraction module is loaded once, up front, via dynamic import with an
+      // explicit path (Node 22.5+ bans bare directory imports).
+      const { extractFromSource, detectLanguage, loadGrammarsForLanguages } = await import('../extraction/index.js');
+      const extensions = getSupportedExtensions();
+      const grammarsLoaded = new Set<string>();
+
+      // TypeScript sources are imported under the extension of their compiled output.
+      const tsSourceExtensions: Record<string, string[]> = {
+        '.js': ['.ts', '.tsx'],
+        '.jsx': ['.tsx'],
+        '.mjs': ['.mts'],
+        '.cjs': ['.cts'],
+      };
+
+      const isFile = (candidate: string): boolean => {
+        try {
+          return fs.statSync(candidate).isFile();
+        } catch {
+          return false;
+        }
+      };
+
+      // Resolve an import specifier found in `fromFile` to a project-relative source
+      // file, or null when nothing on disk matches.
+      const resolveImportToDep = (importPath: string, fromFile: string): string | null => {
+        const fromDir = path.dirname(path.resolve(projectRoot, fromFile));
+        const base = path.resolve(fromDir, importPath);
+        const ext = path.extname(base);
+        const stem = base.slice(0, base.length - ext.length);
+        const candidates = [
+          // The specifier may already name the file, e.g. a C/C++ include `log/Log.hpp`.
+          base,
+          ...(tsSourceExtensions[ext] ?? []).map((tsExt) => stem + tsExt),
+          ...extensions.map((e) => base + e),
+          // Directory import resolved through its index file.
+          ...extensions.map((e) => path.join(base, 'index' + e)),
+        ];
+        for (const candidate of candidates) {
+          if (!isFile(candidate)) continue;
+          const rel = toProjectRelative(candidate);
+          // Files outside the project root cannot be indexed relative to it.
+          if (rel !== null && isSourceFile(rel)) return rel;
+        }
+        return null;
+      };
+
+      const allFiles = new Set<string>(specifiedFiles);
+      let frontier: string[] = [...specifiedFiles];
+      let currentDepth = 0;
+
+      while (currentDepth < maxDepth) {
+        const nextFrontier: string[] = [];
+
+        for (const file of frontier) {
+          // Ensure grammar is loaded for the file's language
+          const lang = detectLanguage(file);
+          if (lang && !grammarsLoaded.has(lang)) {
+            await loadGrammarsForLanguages([lang]);
+            grammarsLoaded.add(lang);
+          }
+
+          try {
+            const content = fs.readFileSync(path.resolve(projectRoot, file), 'utf-8');
+            // Run full tree-sitter extraction for this single file
+            const extraction = extractFromSource(file, content);
+
+            // Extract import module names from import nodes.
+            // For JS/TS these are relative paths like './utils'; for C/C++ these are
+            // bare include names like 'log/Log.hpp'. We resolve all of them against
+            // the filesystem — system headers and unresolvable includes simply won't
+            // be found (this is a known C++ limitation without compile_commands.json).
+            for (const node of extraction?.nodes ?? []) {
+              if (node.kind !== 'import' || !node.name) continue;
+              const dep = resolveImportToDep(node.name, file);
+              if (dep && !allFiles.has(dep)) {
+                allFiles.add(dep);
+                nextFrontier.push(dep);
+              }
+            }
+          } catch (err) {
+            // Best effort: one unreadable or unparsable file must not abort discovery.
+            if (decorate) {
+              console.error(chalk.yellow(`  Skipped ${file}: ${err instanceof Error ? err.message : String(err)}`));
+            }
+          }
+        }
+
+        if (nextFrontier.length === 0) break;
+        frontier = nextFrontier;
+        currentDepth++;
+      }
+
+      // Apply glob filter to discovered files (not specified ones). The specified files
+      // always pass, so filesToIndex can never be empty here.
+      const specifiedSet = new Set(specifiedFiles);
+      const filesToIndex = [...allFiles].filter(
+        (file) => specifiedSet.has(file) || !globFilter || globFilter.test(file)
+      );
+
+      if (decorate) {
+        console.log(chalk.blue(`Dependency depth: ${currentDepth}`));
+        console.log(chalk.blue(`Total files in subgraph: ${filesToIndex.length}`));
+      }
+
+      // 4. Index all files
+      if (decorate) info(`Indexing ${filesToIndex.length} files...`);
+
+      const indexResult = await cg.indexFiles(filesToIndex);
+
+      if (decorate) {
+        console.log(chalk.green(`Indexed: ${indexResult.filesIndexed} files, ${indexResult.nodesCreated} nodes`));
+        for (const e of indexResult.errors) {
+          console.log(chalk.yellow(`  [${e.severity}] ${e.message}`));
+        }
+      }
+
+      // 5. Resolve references to create edges
+      if (decorate) info('Resolving references...');
+
+      // Reinitialize resolver so it picks up newly indexed symbols
+      cg.reinitializeResolver();
+
+      // Use the scoped resolution: only resolve refs from our files
+      const resolutionResult = await cg.resolveReferencesScoped(filesToIndex);
+
+      if (decorate) {
+        console.log(chalk.green(`Resolved: ${resolutionResult.stats.resolved} / ${resolutionResult.stats.total} references`));
+      }
+
+      // 6. Output the subgraph
+      const stats = cg.getStats();
+      if (options.json) {
+        const output = {
+          files: filesToIndex,
+          nodes: stats.nodeCount,
+          edges: stats.edgeCount,
+          specifiedFiles,
+          resolution: resolutionResult.stats,
+        };
+        console.log(JSON.stringify(output, null, 2));
+      } else if (options.quiet) {
+        // Only output file paths, one per line
+        console.log(filesToIndex.join('\n'));
+      } else {
+        console.log(chalk.bold('\nSubgraph summary:'));
+        console.log(chalk.gray(`  Files: ${filesToIndex.length}`));
+        console.log(chalk.gray(`  Nodes: ${stats.nodeCount}`));
+        console.log(chalk.gray(`  Edges: ${stats.edgeCount}`));
+        console.log(chalk.gray(`  Specified: ${specifiedFiles.length}`));
+      }
+    } catch (err) {
+      error(`Specify failed: ${err instanceof Error ? err.message : String(err)}`);
+      exitCode = 1;
+    } finally {
+      // Always release the graph handle and remove a temporary index, including on failure.
+      try {
+        if (cg) cg.destroy();
+        if (wasCreated) {
+          const { removeDirectory } = await import('../directory');
+          removeDirectory(projectRoot);
+        }
+      } catch (cleanupErr) {
+        error(`Cleanup failed: ${cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr)}`);
+        exitCode = 1;
+      }
+    }
+
+    if (exitCode !== 0) process.exit(exitCode);
+  });
+
 /**
  * codegraph install
  */
diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts
index 8f5d0965e..82c80b24e 100644
--- a/src/extraction/grammars.ts
+++ b/src/extraction/grammars.ts
@@ -104,6 +104,16 @@ export const EXTENSION_MAP: Record<string, Language> = {
   '.properties': 'properties',
 };
 
+/**
+ * All file extensions that CodeGraph treats as source files.
+ * Derived from EXTENSION_MAP so the `specify` command (and any other
+ * caller that needs an extension list) never drifts from the indexing logic.
+ * @returns The keys of EXTENSION_MAP, as a fresh array on every call.
+ */
+export function getSupportedExtensions(): string[] {
+  return Object.keys(EXTENSION_MAP);
+}
+
 /**
  * Whether a file is one CodeGraph can parse, based purely on its extension.
  * This is the single source of truth for "should we index this file" — derived
diff --git a/src/extraction/index.ts b/src/extraction/index.ts
index db69d25ad..6fd0d4cd9 100644
--- a/src/extraction/index.ts
+++ b/src/extraction/index.ts
@@ -1094,6 +1094,14 @@ export class ExtractionOrchestrator {
     let totalNodes = 0;
     let totalEdges = 0;
 
+    // Ensure grammars are loaded for the files being indexed
+    const neededLanguages = [...new Set(filePaths.map((f) => detectLanguage(f)))];
+    // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
+    if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
+      neededLanguages.push('cpp');
+    }
+    await loadGrammarsForLanguages(neededLanguages);
+
     for (const filePath of filePaths) {
       const result = await this.indexFile(filePath);
 
@@ -1534,4 +1542,4 @@ export class ExtractionOrchestrator {
 
 // Re-export useful types and functions
 export { extractFromSource } from './tree-sitter';
-export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';
+export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, getSupportedExtensions, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';
diff --git a/src/index.ts b/src/index.ts
index ee3bf51fa..4dc5679a4 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -605,6 +605,16 @@ export class CodeGraph {
     return this.resolver.resolveAndPersistBatched(onProgress);
   }
 
+  /**
+   * Resolve references scoped to specific files only.
+   * Faster than resolveReferencesBatched for small file sets since it
+   * loads unresolved refs only from the given file paths.
+   */
+  async resolveReferencesScoped(filePaths: string[]): Promise<ResolutionResult> {
+    const unresolvedRefs = this.queries.getUnresolvedReferencesByFiles(filePaths);
+    return this.resolver.resolveAndPersist(unresolvedRefs);
+  }
+
   /**
    * Get detected frameworks in the project
    */