Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 154 additions & 9 deletions src/domain/graph/builder/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
initSchema,
MIGRATIONS,
openDb,
purgeFilesData,
releaseAdvisoryLock,
setBuildMeta,
} from '../../../db/index.js';
Expand All @@ -38,6 +39,7 @@ import {
formatDropExtensionSummary,
getActiveEngine,
getInstalledWasmExtensions,
NATIVE_SUPPORTED_EXTENSIONS,
parseFilesWasmForBackfill,
} from '../../parser.js';
import { writeJournalHeader } from '../journal.js';
Expand Down Expand Up @@ -659,11 +661,12 @@ async function tryNativeOrchestrator(
if (result.earlyExit) {
info('No changes detected');
// Even on no-op rebuilds, dropped-language files added since the last
// full build are still missing from `nodes`/`file_hashes` (#1083). The
// orchestrator's file_collector skipped them, so its earlyExit doesn't
// imply DB consistency. Run the gap repair before returning.
// full build are still missing from `nodes`/`file_hashes` (#1083), and
// WASM-only files deleted from disk leave stale rows behind (#1073).
// The orchestrator's file_collector skipped them, so its earlyExit
// doesn't imply DB consistency. Run the gap repair before returning.
const gap = detectDroppedLanguageGap(ctx);
if (gap.missingAbs.length > 0) {
if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) {
await backfillNativeDroppedFiles(ctx, gap);
}
closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
Expand Down Expand Up @@ -773,7 +776,13 @@ async function tryNativeOrchestrator(
const removedCount = result.removedCount ?? 0;
const changedCount = result.changedCount ?? 0;
const gap = detectDroppedLanguageGap(ctx);
if (result.isFullBuild || removedCount > 0 || changedCount > 0 || gap.missingAbs.length > 0) {
if (
result.isFullBuild ||
removedCount > 0 ||
changedCount > 0 ||
gap.missingAbs.length > 0 ||
gap.staleRel.length > 0
) {
await backfillNativeDroppedFiles(ctx, gap);
}

Expand All @@ -787,6 +796,101 @@ interface DroppedLanguageGap {
missingRel: string[];
/** Absolute paths, aligned by index with `missingRel`. */
missingAbs: string[];
/**
* Relative paths of WASM-only files present in DB but absent from disk (#1073).
* Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side
* backfill must purge them. Always disjoint from `missingRel`.
*/
staleRel: string[];
}

/**
* Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper
* is pure and unit-testable independently of `getInstalledWasmExtensions` and
* the `NATIVE_SUPPORTED_EXTENSIONS` global state.
*/
export interface WasmOnlyStaleFilesInput {
/** Distinct `file` values from the `nodes` table. */
existingNodes: ReadonlySet<string>;
/** Distinct `file` values from the `file_hashes` table. */
existingHashes: ReadonlySet<string>;
/** Relative paths currently on disk (from `collectFilesUtil`). */
expected: ReadonlySet<string>;
/** Lowercased extensions whose WASM grammar is installed. */
installedExts: ReadonlySet<string>;
/** Extensions covered by the Rust addon — Rust owns deletion for these. */
nativeSupported: ReadonlySet<string>;
}

/**
* Compute the WASM-only files present in the DB but missing from disk (#1073).
*
* Returns relative paths that:
* - appear in `existingNodes` or `existingHashes` (in DB),
* - are absent from `expected` (not on disk),
* - have an extension installed for WASM, AND
* - have an extension NOT covered by `nativeSupported` — Rust's
* `purge_changed_files` handles deletion for natively-supported extensions
* via its own `detect_removed_files`, so the caller must not double-purge.
*
* Extensions are lowercased before lookup to match the registry and Rust's
* `LanguageKind::from_extension` (which normalises case for the languages
* where both cases are conventional, e.g. R's `.r` / `.R`).
*
* DB paths are forced to forward slashes before comparison with `expected`
* (which is always normalised). The on-disk invariant is that DB rows are
* written with forward slashes, but a stale row written by older code on
* Windows could carry back-slashes — normalising here makes the comparison
* platform-safe and prevents false-positive purges of live rows. We replace
* `\\` explicitly (rather than calling `normalizePath`, which only touches
* `path.sep`) so the defence works when running on POSIX against a DB that
* was migrated from Windows.
*
* Exported for unit testing.
*/
export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] {
const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input;
const stale: string[] = [];
const seen = new Set<string>();
const consider = (rawRel: string): void => {
const rel = rawRel.replace(/\\/g, '/');
if (expected.has(rel) || seen.has(rel)) return;
const ext = path.extname(rel).toLowerCase();
if (nativeSupported.has(ext)) return;
if (!installedExts.has(ext)) return;
seen.add(rel);
// Push the ORIGINAL raw path (not the normalised form) so the eventual
// `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData`
// matches the actual stored row. The dedup `seen` set keeps the
// normalised form so a file written once with `\` and once with `/`
// is still treated as one entry — but the value the SQL sees has to
// be byte-identical to what's on disk in the DB.
stale.push(rawRel);
};
for (const rel of existingNodes) consider(rel);
for (const rel of existingHashes) consider(rel);
return stale;
}

/**
* Group relative paths by their lowercased extension. Shape matches the bucket
* type that `formatDropExtensionSummary` consumes, so callers can render a
* log-friendly per-extension summary without going through `classifyNativeDrops`
* when the reason is already known (e.g. the stale-purge path where every path
* is guaranteed `unsupported-by-native`).
*/
function groupByExtension(relPaths: Iterable<string>): Map<string, string[]> {
const buckets = new Map<string, string[]>();
for (const rel of relPaths) {
const ext = path.extname(rel).toLowerCase();
let list = buckets.get(ext);
if (!list) {
list = [];
buckets.set(ext, list);
}
list.push(rel);
}
return buckets;
}

/**
Expand All @@ -803,6 +907,13 @@ interface DroppedLanguageGap {
* regression — excluding them keeps the warn count in
* `backfillNativeDroppedFiles` meaningful.
*
* Also detects WASM-only files deleted from disk (#1073). Rust's
* `detect_removed_files` filter (#1070) skips files outside its supported
* extensions, so deletions of WASM-only languages don't reach the native
* purge path; the rest of the backfill only inserts rows, so without this
* step stale `nodes`/`file_hashes` rows would linger across incremental
* rebuilds until the next full rebuild.
*
* Cheap (no DB handoff, no parsing): used both to gate the backfill call
* and as its working set. NativeDbProxy supports `.prepare().all()`, so
* this works whether `ctx.db` is a proxy or a real better-sqlite3
Expand Down Expand Up @@ -844,13 +955,25 @@ function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap {
missingRel.push(rel);
missingAbs.push(path.join(ctx.rootDir, rel));
}
return { missingRel, missingAbs };

const staleRel = computeWasmOnlyStaleFiles({
existingNodes,
existingHashes,
expected,
installedExts,
nativeSupported: NATIVE_SUPPORTED_EXTENSIONS,
});

return { missingRel, missingAbs, staleRel };
}

/**
* Backfill files that the native orchestrator silently dropped during parse.
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
*
* Also purges stale rows for WASM-only files deleted from disk (#1073), which
* Rust's `detect_removed_files` filter (#1070) skips.
*
* Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller
* can use the same scan for both gating and the actual backfill — avoiding
* a redundant fs walk when the orchestrator's signals already triggered.
Expand All @@ -859,8 +982,8 @@ async function backfillNativeDroppedFiles(
ctx: PipelineContext,
gap: DroppedLanguageGap,
): Promise<void> {
const { missingRel, missingAbs } = gap;
if (missingAbs.length === 0) return;
const { missingRel, missingAbs, staleRel } = gap;
if (missingAbs.length === 0 && staleRel.length === 0) return;

// Now that we know there's work to do, hand off to better-sqlite3 (needed
// for the INSERT path below).
Expand All @@ -870,6 +993,28 @@ async function backfillNativeDroppedFiles(
ctx.nativeFirstProxy = false;
}

const dbConn = ctx.db as unknown as BetterSqlite3Database;

// Purge WASM-only files that were deleted from disk (#1073). Rust's
// detect_removed_files skips them and the insert path below never visits
// them, so without this their rows would persist across rebuilds until the
// next full rebuild reset the DB.
if (staleRel.length > 0) {
// `computeWasmOnlyStaleFiles` guarantees every path here has an extension
// outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would
// always bucket 100% into `unsupported-by-native`. Build the extension
// summary directly to avoid a redundant classification pass.
const staleByExt = groupByExtension(staleRel);
info(
`Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`,
);
purgeFilesData(dbConn, staleRel);
}

if (missingAbs.length === 0) return;

if (missingAbs.length === 0) return;

// Classify drops so users see per-extension reasons instead of just a count
// (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
// extractor); `native-extractor-failure` indicates a real native bug since
Expand Down Expand Up @@ -918,7 +1063,7 @@ async function backfillNativeDroppedFiles(
exportKeys.push([exp.name, exp.kind, relPath, exp.line]);
}
}
const db = ctx.db as unknown as BetterSqlite3Database;
const db = dbConn;
batchInsertNodes(db, rows);

// Mark exported symbols in batches — mirrors insertDefinitionsAndExports.
Expand Down
148 changes: 148 additions & 0 deletions tests/builder/wasm-only-stale-files.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/**
* Unit tests for `computeWasmOnlyStaleFiles` (#1073).
*
* The Rust orchestrator's `detect_removed_files` filter (#1070) skips files
* outside its supported extensions, so deletions of WASM-only languages don't
* reach the native purge path. The JS-side backfill only inserts rows, so
* without this helper a deleted WASM-only file would leak `nodes`/`file_hashes`
* rows until the next full rebuild.
*
* These tests pass the extension sets as parameters so they remain meaningful
* even when every currently-registered language is natively supported
* (i.e. `installedExts == nativeSupported`). The bug surface re-opens any time
* a new WASM-only language enters the registry before its Rust extractor.
*/
import { describe, expect, it } from 'vitest';
import { computeWasmOnlyStaleFiles } from '../../src/domain/graph/builder/pipeline.js';

const NATIVE = new Set(['.ts', '.js', '.r']);
const INSTALLED = new Set(['.ts', '.js', '.r', '.gleam', '.foo']);

describe('computeWasmOnlyStaleFiles', () => {
it('returns WASM-only files present in DB but missing from disk', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/a.gleam', 'src/b.ts']),
existingHashes: new Set(['src/a.gleam', 'src/b.ts']),
expected: new Set(['src/b.ts']),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/a.gleam']);
});

it('skips natively-supported extensions — Rust owns their deletion path', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/old.ts', 'src/old.r']),
existingHashes: new Set(['src/old.ts', 'src/old.r']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('skips extensions with no installed WASM grammar', () => {
// .bar is not in installedExts — neither engine can parse it, so DB rows
// for it can't have been written by this codepath. Leave them alone.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/x.bar']),
existingHashes: new Set(['src/x.bar']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('catches files that exist only in file_hashes (nodes missing)', () => {
// Legacy DB shape where file_hashes was written but `nodes` was not — the
// backfill should still recognise the file_hashes row as stale.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(),
existingHashes: new Set(['src/leftover.gleam']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/leftover.gleam']);
});

it('catches files that exist only in nodes (file_hashes missing)', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/leftover.gleam']),
existingHashes: new Set(),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/leftover.gleam']);
});

it('deduplicates files appearing in both nodes and file_hashes', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/dup.gleam']),
existingHashes: new Set(['src/dup.gleam']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/dup.gleam']);
});

it('lowercases extensions to match registry/Rust normalisation', () => {
// R is conventionally written `.R` on disk. The registry and the Rust
// `LanguageKind::from_extension` accept both cases; `installedExts` and
// `nativeSupported` carry the lowercase canonical form.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/Plot.R']),
existingHashes: new Set(['src/Plot.R']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
// .R lowercases to .r which IS native-supported, so it should be skipped.
expect(stale).toEqual([]);
});

it('returns empty when DB and disk agree', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/a.gleam', 'src/b.ts']),
existingHashes: new Set(['src/a.gleam', 'src/b.ts']),
expected: new Set(['src/a.gleam', 'src/b.ts']),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('normalises DB paths with back-slashes against forward-slash expected set', () => {
// Defends against false-positive purges on Windows where a stale DB row
// (written by older code) could carry back-slashes while `expected` is
// always normalised. Without `normalizePath` inside `consider`, the file
// would look stale and be purged even though it exists on disk.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src\\live.gleam']),
existingHashes: new Set(['src\\live.gleam']),
expected: new Set(['src/live.gleam']),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('preserves back-slash form so DELETE matches the actual DB row', () => {
// Counterpart to the previous test: when a back-slash DB row is GENUINELY
// stale (file no longer on disk), the returned path must keep its raw form
// so `purgeFilesData`'s `DELETE FROM nodes WHERE file = ?` matches the
// stored row. Pushing the forward-slash-normalised form would let the
// stale row silently persist — exactly the regression #1073 fixes.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src\\dead.gleam']),
existingHashes: new Set(['src\\dead.gleam']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src\\dead.gleam']);
});
});
Loading