diff --git a/src/domain/graph/builder/stages/build-edges.ts b/src/domain/graph/builder/stages/build-edges.ts index e37ec7ac..4d452128 100644 --- a/src/domain/graph/builder/stages/build-edges.ts +++ b/src/domain/graph/builder/stages/build-edges.ts @@ -561,17 +561,82 @@ function buildClassHierarchyEdges( // ── Main entry point ──────────────────────────────────────────────────── +/** + * For small incremental builds (≤5 changed files on a large codebase), scope + * the node loading query to only files that are relevant: changed files + + * their import targets. Falls back to loading ALL nodes for full builds or + * larger incremental changes. + */ +function loadNodes(ctx: PipelineContext): { rows: QueryNodeRow[]; scoped: boolean } { + const { db, fileSymbols, isFullBuild, batchResolved } = ctx; + const nodeKindFilter = `kind IN ('function','method','class','interface','struct','type','module','enum','trait','record','constant')`; + + // Gate: only scope for small incremental on large codebases + if (!isFullBuild && fileSymbols.size <= 5) { + const existingFileCount = ( + db.prepare("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'").get() as { c: number } + ).c; + if (existingFileCount > 20) { + // Collect relevant files: changed files + their import targets + const relevantFiles = new Set(fileSymbols.keys()); + if (batchResolved) { + for (const resolvedPath of batchResolved.values()) { + relevantFiles.add(resolvedPath); + } + } + // Also add barrel-only files + for (const barrelPath of ctx.barrelOnlyFiles) { + relevantFiles.add(barrelPath); + } + + const placeholders = [...relevantFiles].map(() => '?').join(','); + const rows = db + .prepare( + `SELECT id, name, kind, file, line FROM nodes WHERE ${nodeKindFilter} AND file IN (${placeholders})`, + ) + .all(...relevantFiles) as QueryNodeRow[]; + return { rows, scoped: true }; + } + } + + const rows = db + .prepare(`SELECT id, name, kind, file, line FROM nodes WHERE ${nodeKindFilter}`) + .all() as QueryNodeRow[]; + 
return { rows, scoped: false }; +} + +/** + * For scoped node loading, patch nodesByName.get with a lazy SQL fallback + * so global name-only lookups (resolveByMethodOrGlobal, supplementReceiverEdges) + * can still find nodes outside the scoped set. + */ +function addLazyFallback(ctx: PipelineContext, scopedLoad: boolean): void { + if (!scopedLoad) return; + const { db } = ctx; + const fallbackStmt = db.prepare( + `SELECT id, name, kind, file, line FROM nodes WHERE name = ? AND kind != 'file'`, + ); + const originalGet = ctx.nodesByName.get.bind(ctx.nodesByName); + ctx.nodesByName.get = (name: string) => { + const result = originalGet(name); + if (result !== undefined) return result; + const rows = fallbackStmt.all(name) as unknown as NodeRow[]; + if (rows.length > 0) { + ctx.nodesByName.set(name, rows); + return rows; + } + return undefined; + }; +} + export async function buildEdges(ctx: PipelineContext): Promise<void> { const { db, engineName } = ctx; const getNodeIdStmt = makeGetNodeIdStmt(db); - const allNodes = db - .prepare( - `SELECT id, name, kind, file, line FROM nodes WHERE kind IN ('function','method','class','interface','struct','type','module','enum','trait','record','constant')`, - ) - .all() as QueryNodeRow[]; - setupNodeLookups(ctx, allNodes); + const { rows: allNodesBefore, scoped: scopedLoad } = loadNodes(ctx); + setupNodeLookups(ctx, allNodesBefore); + addLazyFallback(ctx, scopedLoad); const t0 = performance.now(); const buildEdgesTx = db.transaction(() => { @@ -592,7 +657,7 @@ export async function buildEdges(ctx: PipelineContext): Promise<void> { const native = engineName === 'native' ? 
loadNative() : null; if (native?.buildCallEdges) { - buildCallEdgesNative(ctx, getNodeIdStmt, allEdgeRows, allNodes, native); + buildCallEdgesNative(ctx, getNodeIdStmt, allEdgeRows, allNodesBefore, native); } else { buildCallEdgesJS(ctx, getNodeIdStmt, allEdgeRows); } diff --git a/src/domain/graph/builder/stages/collect-files.ts b/src/domain/graph/builder/stages/collect-files.ts index 6551b598..aaa658b5 100644 --- a/src/domain/graph/builder/stages/collect-files.ts +++ b/src/domain/graph/builder/stages/collect-files.ts @@ -2,14 +2,78 @@ * Stage: collectFiles * * Collects all source files to process. Handles both normal and scoped rebuilds. + * For incremental builds with a valid journal, reconstructs the file list from + * the DB's file_hashes table + journal deltas, skipping the filesystem scan. */ import fs from 'node:fs'; import path from 'node:path'; -import { info } from '../../../../infrastructure/logger.js'; +import { debug, info } from '../../../../infrastructure/logger.js'; import { normalizePath } from '../../../../shared/constants.js'; +import { readJournal } from '../../journal.js'; import type { PipelineContext } from '../context.js'; import { collectFiles as collectFilesUtil } from '../helpers.js'; +/** + * Reconstruct allFiles from DB file_hashes + journal deltas. + * Returns null when the fast path isn't applicable (first build, no journal, etc). + */ +function tryFastCollect( + ctx: PipelineContext, +): { files: string[]; directories: Set<string> } | null { + const { db, rootDir } = ctx; + + // 1. Check that file_hashes table exists and has entries + let dbFileCount: number; + try { + dbFileCount = (db.prepare('SELECT COUNT(*) as c FROM file_hashes').get() as { c: number }).c; + } catch { + return null; + } + if (dbFileCount === 0) return null; + + // 2. Read the journal — only use fast path when journal has entries, + // proving the watcher was active and tracking changes. An empty-but-valid + // journal (no watcher) could miss file deletions. 
+ const journal = readJournal(rootDir); + if (!journal.valid) return null; + const hasEntries = + (journal.changed && journal.changed.length > 0) || + (journal.removed && journal.removed.length > 0); + if (!hasEntries) return null; + + // 3. Load existing file list from file_hashes (relative paths) + const dbFiles = (db.prepare('SELECT file FROM file_hashes').all() as Array<{ file: string }>).map( + (r) => r.file, + ); + + // 4. Apply journal deltas: remove deleted files, add new/changed files + const fileSet = new Set(dbFiles); + if (journal.removed) { + for (const removed of journal.removed) { + fileSet.delete(removed); + } + } + if (journal.changed) { + for (const changed of journal.changed) { + fileSet.add(changed); + } + } + + // 5. Convert to absolute paths and compute directories + const files: string[] = []; + const directories = new Set<string>(); + for (const relPath of fileSet) { + const absPath = path.join(rootDir, relPath); + files.push(absPath); + directories.add(path.dirname(absPath)); + } + + debug( + `collectFiles fast path: ${dbFiles.length} from DB, journal: +${journal.changed?.length ?? 0}/-${journal.removed?.length ?? 0} → ${files.length} files`, + ); + return { files, directories }; +} + export async function collectFiles(ctx: PipelineContext): Promise<void> { const { rootDir, config, opts } = ctx; @@ -33,10 +97,23 @@ export async function collectFiles(ctx: PipelineContext): Promise<void> { ctx.removed = missing; ctx.isFullBuild = false; info(`Scoped rebuild: ${existing.length} files to rebuild, ${missing.length} to purge`); - } else { - const collected = collectFilesUtil(rootDir, [], config, new Set()); - ctx.allFiles = collected.files; - ctx.discoveredDirs = collected.directories; - info(`Found ${ctx.allFiles.length} files to parse`); + return; } + + // Incremental fast path: reconstruct file list from DB + journal deltas + // instead of full recursive filesystem scan (~8ms savings on 473 files). 
+ if (ctx.incremental && !ctx.forceFullRebuild) { + const fast = tryFastCollect(ctx); + if (fast) { + ctx.allFiles = fast.files; + ctx.discoveredDirs = fast.directories; + info(`Found ${ctx.allFiles.length} files (cached)`); + return; + } + } + + const collected = collectFilesUtil(rootDir, [], config, new Set()); + ctx.allFiles = collected.files; + ctx.discoveredDirs = collected.directories; + info(`Found ${ctx.allFiles.length} files to parse`); } diff --git a/src/domain/graph/builder/stages/finalize.ts b/src/domain/graph/builder/stages/finalize.ts index 0a0f5b97..099f7642 100644 --- a/src/domain/graph/builder/stages/finalize.ts +++ b/src/domain/graph/builder/stages/finalize.ts @@ -67,7 +67,9 @@ export async function finalize(ctx: PipelineContext): Promise<void> { // built_at is only used by stale-embeddings check (skipped for incremental), // and counts are only used by drift detection (skipped for ≤3 files). // This avoids a transaction commit + WAL fsync (~15-30ms). - if (isFullBuild || allSymbols.size > 5) { + // Threshold aligned with drift detection gate (allSymbols.size > 3) so stored + // counts stay fresh whenever drift detection reads them. + if (isFullBuild || allSymbols.size > 3) { try { setBuildMeta(db, { engine: ctx.engineName, @@ -157,6 +159,10 @@ export async function finalize(ctx: PipelineContext): Promise<void> { } } + // Intentionally measured before closeDb / writeJournalHeader / auto-registration: + // for the deferred-close path the close is async (setImmediate), and for full + // builds the metric captures finalize logic only — DB close cost is tracked + // separately via timing.closeDbMs when available. ctx.timing.finalizeMs = performance.now() - t0; // For small incremental builds, defer db.close() to the next event loop tick. @@ -177,7 +183,6 @@ // registered during the initial full build. The dynamic import + file I/O // costs ~100ms which dominates incremental finalize time. 
if (!opts.skipRegistry && isFullBuild) { - const { tmpdir } = await import('node:os'); const tmpDir = path.resolve(tmpdir()); const resolvedRoot = path.resolve(rootDir); if (resolvedRoot.startsWith(tmpDir)) {