diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index 1416631a..72a8ca20 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -15,6 +15,7 @@ import { initSchema, MIGRATIONS, openDb, + purgeFilesData, releaseAdvisoryLock, setBuildMeta, } from '../../../db/index.js'; @@ -38,6 +39,7 @@ import { formatDropExtensionSummary, getActiveEngine, getInstalledWasmExtensions, + NATIVE_SUPPORTED_EXTENSIONS, parseFilesWasmForBackfill, } from '../../parser.js'; import { writeJournalHeader } from '../journal.js'; @@ -659,11 +661,12 @@ async function tryNativeOrchestrator( if (result.earlyExit) { info('No changes detected'); // Even on no-op rebuilds, dropped-language files added since the last - // full build are still missing from `nodes`/`file_hashes` (#1083). The - // orchestrator's file_collector skipped them, so its earlyExit doesn't - // imply DB consistency. Run the gap repair before returning. + // full build are still missing from `nodes`/`file_hashes` (#1083), and + // WASM-only files deleted from disk leave stale rows behind (#1073). + // The orchestrator's file_collector skipped them, so its earlyExit + // doesn't imply DB consistency. Run the gap repair before returning. const gap = detectDroppedLanguageGap(ctx); - if (gap.missingAbs.length > 0) { + if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) { await backfillNativeDroppedFiles(ctx, gap); } closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); @@ -773,7 +776,13 @@ async function tryNativeOrchestrator( const removedCount = result.removedCount ?? 0; const changedCount = result.changedCount ?? 0; const gap = detectDroppedLanguageGap(ctx); - if (result.isFullBuild || removedCount > 0 || changedCount > 0 || gap.missingAbs.length > 0) { + if ( + result.isFullBuild || + removedCount > 0 || + changedCount > 0 || + gap.missingAbs.length > 0 || + gap.staleRel.length > 0 + ) { await backfillNativeDroppedFiles(ctx, gap); } @@ -787,6 +796,101 @@ interface DroppedLanguageGap { missingRel: string[]; /** Absolute paths, aligned by index with `missingRel`. */ missingAbs: string[]; + /** + * Relative paths of WASM-only files present in DB but absent from disk (#1073). + * Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side + * backfill must purge them. Always disjoint from `missingRel`. + */ + staleRel: string[]; +} + +/** + * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper + * is pure and unit-testable independently of `getInstalledWasmExtensions` and + * the `NATIVE_SUPPORTED_EXTENSIONS` global state. + */ +export interface WasmOnlyStaleFilesInput { + /** Distinct `file` values from the `nodes` table. */ + existingNodes: ReadonlySet; + /** Distinct `file` values from the `file_hashes` table. */ + existingHashes: ReadonlySet; + /** Relative paths currently on disk (from `collectFilesUtil`). */ + expected: ReadonlySet; + /** Lowercased extensions whose WASM grammar is installed. */ + installedExts: ReadonlySet; + /** Extensions covered by the Rust addon — Rust owns deletion for these. */ + nativeSupported: ReadonlySet; +} + +/** + * Compute the WASM-only files present in the DB but missing from disk (#1073). + * + * Returns relative paths that: + * - appear in `existingNodes` or `existingHashes` (in DB), + * - are absent from `expected` (not on disk), + * - have an extension installed for WASM, AND + * - have an extension NOT covered by `nativeSupported` — Rust's + * `purge_changed_files` handles deletion for natively-supported extensions + * via its own `detect_removed_files`, so the caller must not double-purge. + * + * Extensions are lowercased before lookup to match the registry and Rust's + * `LanguageKind::from_extension` (which normalises case for the languages + * where both cases are conventional, e.g. R's `.r` / `.R`). + * + * DB paths are forced to forward slashes before comparison with `expected` + * (which is always normalised). The on-disk invariant is that DB rows are + * written with forward slashes, but a stale row written by older code on + * Windows could carry back-slashes — normalising here makes the comparison + * platform-safe and prevents false-positive purges of live rows. We replace + * `\\` explicitly (rather than calling `normalizePath`, which only touches + * `path.sep`) so the defence works when running on POSIX against a DB that + * was migrated from Windows. + * + * Exported for unit testing. + */ +export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { + const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; + const stale: string[] = []; + const seen = new Set(); + const consider = (rawRel: string): void => { + const rel = rawRel.replace(/\\/g, '/'); + if (expected.has(rel) || seen.has(rel)) return; + const ext = path.extname(rel).toLowerCase(); + if (nativeSupported.has(ext)) return; + if (!installedExts.has(ext)) return; + seen.add(rel); + // Push the ORIGINAL raw path (not the normalised form) so the eventual + // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData` + // matches the actual stored row. The dedup `seen` set keeps the + // normalised form so a file written once with `\` and once with `/` + // is still treated as one entry — but the value the SQL sees has to + // be byte-identical to what's on disk in the DB. + stale.push(rawRel); + }; + for (const rel of existingNodes) consider(rel); + for (const rel of existingHashes) consider(rel); + return stale; +} + +/** + * Group relative paths by their lowercased extension. Shape matches the bucket + * type that `formatDropExtensionSummary` consumes, so callers can render a + * log-friendly per-extension summary without going through `classifyNativeDrops` + * when the reason is already known (e.g. the stale-purge path where every path + * is guaranteed `unsupported-by-native`). + */ +function groupByExtension(relPaths: Iterable): Map { + const buckets = new Map(); + for (const rel of relPaths) { + const ext = path.extname(rel).toLowerCase(); + let list = buckets.get(ext); + if (!list) { + list = []; + buckets.set(ext, list); + } + list.push(rel); + } + return buckets; } /** @@ -803,6 +907,13 @@ interface DroppedLanguageGap { * regression — excluding them keeps the warn count in * `backfillNativeDroppedFiles` meaningful. * + * Also detects WASM-only files deleted from disk (#1073). Rust's + * `detect_removed_files` filter (#1070) skips files outside its supported + * extensions, so deletions of WASM-only languages don't reach the native + * purge path; the rest of the backfill only inserts rows, so without this + * step stale `nodes`/`file_hashes` rows would linger across incremental + * rebuilds until the next full rebuild. + * * Cheap (no DB handoff, no parsing): used both to gate the backfill call * and as its working set. NativeDbProxy supports `.prepare().all()`, so * this works whether `ctx.db` is a proxy or a real better-sqlite3 @@ -844,13 +955,25 @@ function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { missingRel.push(rel); missingAbs.push(path.join(ctx.rootDir, rel)); } - return { missingRel, missingAbs }; + + const staleRel = computeWasmOnlyStaleFiles({ + existingNodes, + existingHashes, + expected, + installedExts, + nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, + }); + + return { missingRel, missingAbs, staleRel }; } /** * Backfill files that the native orchestrator silently dropped during parse. * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). * + * Also purges stale rows for WASM-only files deleted from disk (#1073), which + * Rust's `detect_removed_files` filter (#1070) skips. + * * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller * can use the same scan for both gating and the actual backfill — avoiding * a redundant fs walk when the orchestrator's signals already triggered. @@ -859,8 +982,8 @@ async function backfillNativeDroppedFiles( ctx: PipelineContext, gap: DroppedLanguageGap, ): Promise { - const { missingRel, missingAbs } = gap; - if (missingAbs.length === 0) return; + const { missingRel, missingAbs, staleRel } = gap; + if (missingAbs.length === 0 && staleRel.length === 0) return; // Now that we know there's work to do, hand off to better-sqlite3 (needed // for the INSERT path below). @@ -870,6 +993,28 @@ async function backfillNativeDroppedFiles( ctx.nativeFirstProxy = false; } + const dbConn = ctx.db as unknown as BetterSqlite3Database; + + // Purge WASM-only files that were deleted from disk (#1073). Rust's + // detect_removed_files skips them and the insert path below never visits + // them, so without this their rows would persist across rebuilds until the + // next full rebuild reset the DB. + if (staleRel.length > 0) { + // `computeWasmOnlyStaleFiles` guarantees every path here has an extension + // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would + // always bucket 100% into `unsupported-by-native`. Build the extension + // summary directly to avoid a redundant classification pass. + const staleByExt = groupByExtension(staleRel); + info( + `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`, + ); + purgeFilesData(dbConn, staleRel); + } + + if (missingAbs.length === 0) return; + + if (missingAbs.length === 0) return; + // Classify drops so users see per-extension reasons instead of just a count // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust // extractor); `native-extractor-failure` indicates a real native bug since @@ -918,7 +1063,7 @@ async function backfillNativeDroppedFiles( exportKeys.push([exp.name, exp.kind, relPath, exp.line]); } } - const db = ctx.db as unknown as BetterSqlite3Database; + const db = dbConn; batchInsertNodes(db, rows); // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. diff --git a/tests/builder/wasm-only-stale-files.test.ts b/tests/builder/wasm-only-stale-files.test.ts new file mode 100644 index 00000000..1ca4569a --- /dev/null +++ b/tests/builder/wasm-only-stale-files.test.ts @@ -0,0 +1,148 @@ +/** + * Unit tests for `computeWasmOnlyStaleFiles` (#1073). + * + * The Rust orchestrator's `detect_removed_files` filter (#1070) skips files + * outside its supported extensions, so deletions of WASM-only languages don't + * reach the native purge path. The JS-side backfill only inserts rows, so + * without this helper a deleted WASM-only file would leak `nodes`/`file_hashes` + * rows until the next full rebuild. + * + * These tests pass the extension sets as parameters so they remain meaningful + * even when every currently-registered language is natively supported + * (i.e. `installedExts == nativeSupported`). The bug surface re-opens any time + * a new WASM-only language enters the registry before its Rust extractor. + */ +import { describe, expect, it } from 'vitest'; +import { computeWasmOnlyStaleFiles } from '../../src/domain/graph/builder/pipeline.js'; + +const NATIVE = new Set(['.ts', '.js', '.r']); +const INSTALLED = new Set(['.ts', '.js', '.r', '.gleam', '.foo']); + +describe('computeWasmOnlyStaleFiles', () => { + it('returns WASM-only files present in DB but missing from disk', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/a.gleam', 'src/b.ts']), + existingHashes: new Set(['src/a.gleam', 'src/b.ts']), + expected: new Set(['src/b.ts']), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/a.gleam']); + }); + + it('skips natively-supported extensions — Rust owns their deletion path', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/old.ts', 'src/old.r']), + existingHashes: new Set(['src/old.ts', 'src/old.r']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); + + it('skips extensions with no installed WASM grammar', () => { + // .bar is not in installedExts — neither engine can parse it, so DB rows + // for it can't have been written by this codepath. Leave them alone. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/x.bar']), + existingHashes: new Set(['src/x.bar']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); + + it('catches files that exist only in file_hashes (nodes missing)', () => { + // Legacy DB shape where file_hashes was written but `nodes` was not — the + // backfill should still recognise the file_hashes row as stale. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(), + existingHashes: new Set(['src/leftover.gleam']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/leftover.gleam']); + }); + + it('catches files that exist only in nodes (file_hashes missing)', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/leftover.gleam']), + existingHashes: new Set(), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/leftover.gleam']); + }); + + it('deduplicates files appearing in both nodes and file_hashes', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/dup.gleam']), + existingHashes: new Set(['src/dup.gleam']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/dup.gleam']); + }); + + it('lowercases extensions to match registry/Rust normalisation', () => { + // R is conventionally written `.R` on disk. The registry and the Rust + // `LanguageKind::from_extension` accept both cases; `installedExts` and + // `nativeSupported` carry the lowercase canonical form. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/Plot.R']), + existingHashes: new Set(['src/Plot.R']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + // .R lowercases to .r which IS native-supported, so it should be skipped. + expect(stale).toEqual([]); + }); + + it('returns empty when DB and disk agree', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/a.gleam', 'src/b.ts']), + existingHashes: new Set(['src/a.gleam', 'src/b.ts']), + expected: new Set(['src/a.gleam', 'src/b.ts']), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); + + it('normalises DB paths with back-slashes against forward-slash expected set', () => { + // Defends against false-positive purges on Windows where a stale DB row + // (written by older code) could carry back-slashes while `expected` is + // always normalised. Without `normalizePath` inside `consider`, the file + // would look stale and be purged even though it exists on disk. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src\\live.gleam']), + existingHashes: new Set(['src\\live.gleam']), + expected: new Set(['src/live.gleam']), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); + + it('preserves back-slash form so DELETE matches the actual DB row', () => { + // Counterpart to the previous test: when a back-slash DB row is GENUINELY + // stale (file no longer on disk), the returned path must keep its raw form + // so `purgeFilesData`'s `DELETE FROM nodes WHERE file = ?` matches the + // stored row. Pushing the forward-slash-normalised form would let the + // stale row silently persist — exactly the regression #1073 fixes. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src\\dead.gleam']), + existingHashes: new Set(['src\\dead.gleam']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src\\dead.gleam']); + }); +});