From f26abeeada5185cdc715ec1236ff10218711f9df Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 14 May 2026 15:57:10 -0600 Subject: [PATCH 1/3] fix(native): purge stale rows when WASM-only files are deleted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #1073 When a file in a WASM-only language is deleted from disk, neither engine cleans up its DB rows: Rust's detect_removed_files (#1070) skips files outside is_supported_extension, and the JS-side backfill only inserts. Add computeWasmOnlyStaleFiles to detect (existingNodes ∪ existingHashes) not on disk, filtered to extensions installed for WASM but absent from NATIVE_SUPPORTED_EXTENSIONS so Rust still owns its own purge path. Wire it into backfillNativeDroppedFiles to call purgeFilesData after the better-sqlite3 handoff. Unit tests cover the helper. docs check acknowledged — internal pipeline fix, no doc-visible surface. --- src/domain/graph/builder/pipeline.ts | 96 +++++++++++++++- tests/builder/wasm-only-stale-files.test.ts | 117 ++++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 tests/builder/wasm-only-stale-files.test.ts diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index f31afa77..b31ef9f6 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -15,6 +15,7 @@ import { initSchema, MIGRATIONS, openDb, + purgeFilesData, releaseAdvisoryLock, setBuildMeta, } from '../../../db/index.js'; @@ -38,6 +39,7 @@ import { formatDropExtensionSummary, getActiveEngine, getInstalledWasmExtensions, + NATIVE_SUPPORTED_EXTENSIONS, parseFilesWasmForBackfill, } from '../../parser.js'; import { writeJournalHeader } from '../journal.js'; @@ -774,6 +776,58 @@ async function tryNativeOrchestrator( return formatNativeTimingResult(p, structurePatchMs, analysisTiming); } +/** + * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper + * is pure and unit-testable independently of `getInstalledWasmExtensions` and + * the `NATIVE_SUPPORTED_EXTENSIONS` global state. + */ +export interface WasmOnlyStaleFilesInput { + /** Distinct `file` values from the `nodes` table. */ + existingNodes: ReadonlySet; + /** Distinct `file` values from the `file_hashes` table. */ + existingHashes: ReadonlySet; + /** Relative paths currently on disk (from `collectFilesUtil`). */ + expected: ReadonlySet; + /** Lowercased extensions whose WASM grammar is installed. */ + installedExts: ReadonlySet; + /** Extensions covered by the Rust addon — Rust owns deletion for these. */ + nativeSupported: ReadonlySet; +} + +/** + * Compute the WASM-only files present in the DB but missing from disk (#1073). + * + * Returns relative paths that: + * - appear in `existingNodes` or `existingHashes` (in DB), + * - are absent from `expected` (not on disk), + * - have an extension installed for WASM, AND + * - have an extension NOT covered by `nativeSupported` — Rust's + * `purge_changed_files` handles deletion for natively-supported extensions + * via its own `detect_removed_files`, so the caller must not double-purge. + * + * Extensions are lowercased before lookup to match the registry and Rust's + * `LanguageKind::from_extension` (which normalises case for the languages + * where both cases are conventional, e.g. R's `.r` / `.R`). + * + * Exported for unit testing. + */ +export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { + const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; + const stale: string[] = []; + const seen = new Set(); + const consider = (rel: string): void => { + if (expected.has(rel) || seen.has(rel)) return; + const ext = path.extname(rel).toLowerCase(); + if (nativeSupported.has(ext)) return; + if (!installedExts.has(ext)) return; + seen.add(rel); + stale.push(rel); + }; + for (const rel of existingNodes) consider(rel); + for (const rel of existingHashes) consider(rel); + return stale; +} + /** * Backfill files that the native orchestrator silently dropped during parse. * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). @@ -830,7 +884,22 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { missingRel.push(rel); missingAbs.push(path.join(ctx.rootDir, rel)); } - if (missingAbs.length === 0) return; + + // Detect WASM-only files deleted from disk (#1073). The Rust orchestrator's + // `detect_removed_files` filter (#1070) skips files outside its supported + // extensions, so deletions of WASM-only languages don't reach the native + // purge path. JS doesn't notice either — the rest of this function only + // inserts rows. Result: stale `nodes`/`file_hashes` rows linger across + // incremental rebuilds until the next full rebuild. + const staleRel = computeWasmOnlyStaleFiles({ + existingNodes, + existingHashes, + expected, + installedExts, + nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, + }); + + if (missingAbs.length === 0 && staleRel.length === 0) return; // Now that we know there's work to do, hand off to better-sqlite3 (needed // for the INSERT path below). @@ -840,6 +909,29 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { ctx.nativeFirstProxy = false; } + const dbConn = ctx.db as unknown as BetterSqlite3Database; + + // Purge WASM-only files that were deleted from disk (#1073). Rust's + // detect_removed_files skips them and the insert path below never visits + // them, so without this their rows would persist across rebuilds until the + // next full rebuild reset the DB. + if (staleRel.length > 0) { + const { byReason: staleByReason, totals: staleTotals } = classifyNativeDrops(staleRel); + info( + `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByReason['unsupported-by-native'])}`, + ); + // staleRel is restricted above to extensions outside NATIVE_SUPPORTED_EXTENSIONS, + // so the native-extractor-failure bucket should always be empty here. + if (staleTotals['native-extractor-failure'] > 0) { + debug( + `backfillNativeDroppedFiles: stale-purge classified ${staleTotals['native-extractor-failure']} native-supported file(s) — unexpected; inspect the filter`, + ); + } + purgeFilesData(dbConn, staleRel); + } + + if (missingAbs.length === 0) return; + // Classify drops so users see per-extension reasons instead of just a count // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust // extractor); `native-extractor-failure` indicates a real native bug since @@ -888,7 +980,7 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { exportKeys.push([exp.name, exp.kind, relPath, exp.line]); } } - const db = ctx.db as unknown as BetterSqlite3Database; + const db = dbConn; batchInsertNodes(db, rows); // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. diff --git a/tests/builder/wasm-only-stale-files.test.ts b/tests/builder/wasm-only-stale-files.test.ts new file mode 100644 index 00000000..8a8d69fb --- /dev/null +++ b/tests/builder/wasm-only-stale-files.test.ts @@ -0,0 +1,117 @@ +/** + * Unit tests for `computeWasmOnlyStaleFiles` (#1073). + * + * The Rust orchestrator's `detect_removed_files` filter (#1070) skips files + * outside its supported extensions, so deletions of WASM-only languages don't + * reach the native purge path. The JS-side backfill only inserts rows, so + * without this helper a deleted WASM-only file would leak `nodes`/`file_hashes` + * rows until the next full rebuild. + * + * These tests pass the extension sets as parameters so they remain meaningful + * even when every currently-registered language is natively supported + * (i.e. `installedExts == nativeSupported`). The bug surface re-opens any time + * a new WASM-only language enters the registry before its Rust extractor. + */ +import { describe, expect, it } from 'vitest'; +import { computeWasmOnlyStaleFiles } from '../../src/domain/graph/builder/pipeline.js'; + +const NATIVE = new Set(['.ts', '.js', '.r']); +const INSTALLED = new Set(['.ts', '.js', '.r', '.gleam', '.foo']); + +describe('computeWasmOnlyStaleFiles', () => { + it('returns WASM-only files present in DB but missing from disk', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/a.gleam', 'src/b.ts']), + existingHashes: new Set(['src/a.gleam', 'src/b.ts']), + expected: new Set(['src/b.ts']), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/a.gleam']); + }); + + it('skips natively-supported extensions — Rust owns their deletion path', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/old.ts', 'src/old.r']), + existingHashes: new Set(['src/old.ts', 'src/old.r']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); + + it('skips extensions with no installed WASM grammar', () => { + // .bar is not in installedExts — neither engine can parse it, so DB rows + // for it can't have been written by this codepath. Leave them alone. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/x.bar']), + existingHashes: new Set(['src/x.bar']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); + + it('catches files that exist only in file_hashes (nodes missing)', () => { + // Legacy DB shape where file_hashes was written but `nodes` was not — the + // backfill should still recognise the file_hashes row as stale. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(), + existingHashes: new Set(['src/leftover.gleam']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/leftover.gleam']); + }); + + it('catches files that exist only in nodes (file_hashes missing)', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/leftover.gleam']), + existingHashes: new Set(), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/leftover.gleam']); + }); + + it('deduplicates files appearing in both nodes and file_hashes', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/dup.gleam']), + existingHashes: new Set(['src/dup.gleam']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src/dup.gleam']); + }); + + it('lowercases extensions to match registry/Rust normalisation', () => { + // R is conventionally written `.R` on disk. The registry and the Rust + // `LanguageKind::from_extension` accept both cases; `installedExts` and + // `nativeSupported` carry the lowercase canonical form. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/Plot.R']), + existingHashes: new Set(['src/Plot.R']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + // .R lowercases to .r which IS native-supported, so it should be skipped. + expect(stale).toEqual([]); + }); + + it('returns empty when DB and disk agree', () => { + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src/a.gleam', 'src/b.ts']), + existingHashes: new Set(['src/a.gleam', 'src/b.ts']), + expected: new Set(['src/a.gleam', 'src/b.ts']), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); +}); From 8af09cc428b1ebc6b28a5cd7d171abd39f59b670 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 14 May 2026 23:17:35 -0600 Subject: [PATCH 2/3] refactor(builder): drop redundant classify pass and harden DB path matching (#1122) Address Greptile review on the WASM-only stale purge: - Replace classifyNativeDrops + unreachable native-extractor-failure guard in the stale-purge branch with a direct groupByExtension helper. The paths returned by computeWasmOnlyStaleFiles are guaranteed to be outside NATIVE_SUPPORTED_EXTENSIONS, so the classification pass was always a no-op for the unreachable bucket. - Normalise back-slashes to forward-slashes inside consider() before comparing against the expected set. Defends against a stale DB row (e.g. migrated from a Windows-built DB) being treated as missing and purged even when the file still exists on disk. Replace `\\` explicitly rather than calling normalizePath so the defence works on POSIX too. - Add a regression test for the back-slash path-matching case. --- src/domain/graph/builder/pipeline.ts | 48 ++++++++++++++++----- tests/builder/wasm-only-stale-files.test.ts | 15 +++++++ 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index b31ef9f6..327e8f13 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -809,13 +809,23 @@ export interface WasmOnlyStaleFilesInput { * `LanguageKind::from_extension` (which normalises case for the languages * where both cases are conventional, e.g. R's `.r` / `.R`). * + * DB paths are forced to forward slashes before comparison with `expected` + * (which is always normalised). The on-disk invariant is that DB rows are + * written with forward slashes, but a stale row written by older code on + * Windows could carry back-slashes — normalising here makes the comparison + * platform-safe and prevents false-positive purges of live rows. We replace + * `\\` explicitly (rather than calling `normalizePath`, which only touches + * `path.sep`) so the defence works when running on POSIX against a DB that + * was migrated from Windows. + * * Exported for unit testing. */ export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; const stale: string[] = []; const seen = new Set(); - const consider = (rel: string): void => { + const consider = (rawRel: string): void => { + const rel = rawRel.replace(/\\/g, '/'); if (expected.has(rel) || seen.has(rel)) return; const ext = path.extname(rel).toLowerCase(); if (nativeSupported.has(ext)) return; @@ -828,6 +838,27 @@ export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): strin return stale; } +/** + * Group relative paths by their lowercased extension. Shape matches the bucket + * type that `formatDropExtensionSummary` consumes, so callers can render a + * log-friendly per-extension summary without going through `classifyNativeDrops` + * when the reason is already known (e.g. the stale-purge path where every path + * is guaranteed `unsupported-by-native`). + */ +function groupByExtension(relPaths: Iterable): Map { + const buckets = new Map(); + for (const rel of relPaths) { + const ext = path.extname(rel).toLowerCase(); + let list = buckets.get(ext); + if (!list) { + list = []; + buckets.set(ext, list); + } + list.push(rel); + } + return buckets; +} + /** * Backfill files that the native orchestrator silently dropped during parse. * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). @@ -916,17 +947,14 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { // them, so without this their rows would persist across rebuilds until the // next full rebuild reset the DB. if (staleRel.length > 0) { - const { byReason: staleByReason, totals: staleTotals } = classifyNativeDrops(staleRel); + // `computeWasmOnlyStaleFiles` guarantees every path here has an extension + // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would + // always bucket 100% into `unsupported-by-native`. Build the extension + // summary directly to avoid a redundant classification pass. + const staleByExt = groupByExtension(staleRel); info( - `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByReason['unsupported-by-native'])}`, + `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`, ); - // staleRel is restricted above to extensions outside NATIVE_SUPPORTED_EXTENSIONS, - // so the native-extractor-failure bucket should always be empty here. - if (staleTotals['native-extractor-failure'] > 0) { - debug( - `backfillNativeDroppedFiles: stale-purge classified ${staleTotals['native-extractor-failure']} native-supported file(s) — unexpected; inspect the filter`, - ); - } purgeFilesData(dbConn, staleRel); } diff --git a/tests/builder/wasm-only-stale-files.test.ts b/tests/builder/wasm-only-stale-files.test.ts index 8a8d69fb..eef633b1 100644 --- a/tests/builder/wasm-only-stale-files.test.ts +++ b/tests/builder/wasm-only-stale-files.test.ts @@ -114,4 +114,19 @@ describe('computeWasmOnlyStaleFiles', () => { }); expect(stale).toEqual([]); }); + + it('normalises DB paths with back-slashes against forward-slash expected set', () => { + // Defends against false-positive purges on Windows where a stale DB row + // (written by older code) could carry back-slashes while `expected` is + // always normalised. Without `normalizePath` inside `consider`, the file + // would look stale and be purged even though it exists on disk. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src\\live.gleam']), + existingHashes: new Set(['src\\live.gleam']), + expected: new Set(['src/live.gleam']), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual([]); + }); }); From 4c22410b711d8fe028f675f899e8e05997de1f81 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Fri, 15 May 2026 02:08:14 -0600 Subject: [PATCH 3/3] fix(stale-purge): preserve raw path so DELETE matches DB row (#1122) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile P1: pushing the forward-slash-normalised path into the stale list meant 'DELETE FROM nodes WHERE file = ?' missed rows that had been written with back-slashes (e.g. legacy Windows-migrated DBs) — the exact regression #1073 is trying to fix. The dedup key keeps the normalised form so a file written once with '\' and once with '/' is still treated as one entry, but the value the SQL sees is now byte-identical to what's stored. Added a regression test covering a back-slash row with an empty 'expected' set. --- src/domain/graph/builder/pipeline.ts | 8 +++++++- tests/builder/wasm-only-stale-files.test.ts | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index cb6025c2..72a8ca20 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -859,7 +859,13 @@ export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): strin if (nativeSupported.has(ext)) return; if (!installedExts.has(ext)) return; seen.add(rel); - stale.push(rel); + // Push the ORIGINAL raw path (not the normalised form) so the eventual + // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData` + // matches the actual stored row. The dedup `seen` set keeps the + // normalised form so a file written once with `\` and once with `/` + // is still treated as one entry — but the value the SQL sees has to + // be byte-identical to what's on disk in the DB. + stale.push(rawRel); }; for (const rel of existingNodes) consider(rel); for (const rel of existingHashes) consider(rel); diff --git a/tests/builder/wasm-only-stale-files.test.ts b/tests/builder/wasm-only-stale-files.test.ts index eef633b1..1ca4569a 100644 --- a/tests/builder/wasm-only-stale-files.test.ts +++ b/tests/builder/wasm-only-stale-files.test.ts @@ -129,4 +129,20 @@ describe('computeWasmOnlyStaleFiles', () => { }); expect(stale).toEqual([]); }); + + it('preserves back-slash form so DELETE matches the actual DB row', () => { + // Counterpart to the previous test: when a back-slash DB row is GENUINELY + // stale (file no longer on disk), the returned path must keep its raw form + // so `purgeFilesData`'s `DELETE FROM nodes WHERE file = ?` matches the + // stored row. Pushing the forward-slash-normalised form would let the + // stale row silently persist — exactly the regression #1073 fixes. + const stale = computeWasmOnlyStaleFiles({ + existingNodes: new Set(['src\\dead.gleam']), + existingHashes: new Set(['src\\dead.gleam']), + expected: new Set(), + installedExts: INSTALLED, + nativeSupported: NATIVE, + }); + expect(stale).toEqual(['src\\dead.gleam']); + }); });