From 4599421b24e42f7d34e03c47371e9a6a163daa22 Mon Sep 17 00:00:00 2001 From: arttttt Date: Sat, 30 May 2026 00:17:42 +0300 Subject: [PATCH 1/5] fix(extraction): don't hang daemon init when the parse worker fails to load grammars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parse worker's grammar-load handshake was awaited with a bare `once('message')` that only resolved on `grammars-loaded`. If the worker died while loading grammars (a tree-sitter WASM abort), that message never arrived, so the await — and the in-process `indexMutex` it runs under — hung forever. In the shared daemon (one process across all MCP clients), this wedged the background catch-up sync, so the first tool call and every client that connected afterwards hung on init with no fallback, regardless of project size. `ensureWorker` now awaits via the new exported `awaitWorkerGrammarLoad`, which settles on the first of grammars-loaded / grammars-load-failed / worker error / worker exit / timeout. On failure the worker is torn down and parsing degrades to in-process (slower but correct) for the rest of the run. The worker's `load-grammars` handler also reports JS-level load failures instead of dying silently. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/extraction/index.ts | 138 +++++++++++++++++++++++++++------ src/extraction/parse-worker.ts | 15 +++- 2 files changed, 129 insertions(+), 24 deletions(-) diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 42037d7f6..98a679feb 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -39,6 +39,14 @@ const FILE_IO_BATCH_SIZE = 10; */ const PARSE_TIMEOUT_MS = 10_000; +/** + * Backstop for the one-time grammar-load handshake with a fresh worker. The + * worker normally answers `grammars-loaded` in well under a second; this only + * fires if it goes silent without crashing. Worker crash/exit during load is + * caught immediately (no need to wait this out) — see `ensureWorker`. + */ +const GRAMMAR_LOAD_TIMEOUT_MS = 30_000; + /** * Number of files to parse before recycling the worker thread. * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation). @@ -48,6 +56,60 @@ const PARSE_TIMEOUT_MS = 10_000; */ const WORKER_RECYCLE_INTERVAL = 250; +/** + * Minimal shape of a parse worker needed for the grammar-load handshake. The + * real `worker_threads.Worker` satisfies it; tests pass a fake EventEmitter. + */ +export interface GrammarLoadWorker { + on(event: string, cb: (...args: any[]) => void): void; + off(event: string, cb: (...args: any[]) => void): void; + once(event: string, cb: (...args: any[]) => void): void; + postMessage(msg: unknown): void; +} + +/** + * Await a fresh worker's one-time grammar-load handshake, settling on the FIRST + * of: `grammars-loaded` (resolve) / `grammars-load-failed` / worker `error` / + * worker `exit` / timeout (all reject). Listeners are always removed and the + * timer cleared on settle. + * + * Regression guard (daemon-init-hang fix): the bare `once('message')` this + * replaced only resolved on `grammars-loaded` and never settled when the worker + * DIED loading grammars (a tree-sitter WASM abort). The caller's await — and the + * in-process `indexMutex` it runs under — then hung forever, which in a + * long-lived MCP daemon wedged initialization for every client that connected. + */ +export function awaitWorkerGrammarLoad( + worker: GrammarLoadWorker, + languages: Language[], + timeoutMs: number = GRAMMAR_LOAD_TIMEOUT_MS +): Promise { + return new Promise((resolve, reject) => { + let settled = false; + const finish = (err?: Error) => { + if (settled) return; + settled = true; + clearTimeout(timer); + worker.off('message', onMessage); + worker.off('error', onError); + worker.off('exit', onExit); + if (err) reject(err); else resolve(); + }; + const onMessage = (msg: { type: string; error?: string }) => { + if (msg.type === 'grammars-loaded') finish(); + else if (msg.type === 'grammars-load-failed') finish(new Error(`worker failed to load grammars: ${msg.error ?? 'unknown error'}`)); + }; + const onError = (err: Error) => finish(err); + const onExit = (code: number) => finish(new Error(`worker exited (code ${code}) before grammars loaded`)); + const timer = setTimeout(() => finish(new Error(`grammar load timed out after ${timeoutMs}ms`)), timeoutMs); + timer.unref?.(); + worker.on('message', onMessage); + worker.once('error', onError); + worker.once('exit', onExit); + worker.postMessage({ type: 'load-grammars', languages }); + }); +} + /** * Progress callback for indexing operations */ @@ -682,6 +744,22 @@ export class ExtractionOrchestrator { await loadGrammarsForLanguages(neededLanguages); } + // Set true if the worker becomes unusable mid-run (e.g. it crashed while + // loading grammars). From then on we parse in-process so a broken worker + // degrades to slower-but-correct parsing instead of hanging the run. + let workerDisabled = false; + // Grammars are already loaded on the main thread only when we never spawned + // a worker. The in-process fallback lazily loads them on first use. + let mainGrammarsLoaded = !useWorker; + + async function parseInProcess(filePath: string, content: string): Promise { + if (!mainGrammarsLoaded) { + await loadGrammarsForLanguages(neededLanguages); // idempotent — skips already-loaded + mainGrammarsLoaded = true; + } + return extractFromSource(filePath, content, detectLanguage(filePath, content), frameworkNames); + } + // --- Worker lifecycle management --- // The worker can crash (OOM in WASM) or hang on pathological files. // We track pending parse promises and handle both cases: @@ -738,23 +816,33 @@ export class ExtractionOrchestrator { async function ensureWorker(): Promise { if (parseWorker) return parseWorker; log('Spawning new parse worker...'); - parseWorker = new WorkerClass!(parseWorkerPath); - attachWorkerHandlers(parseWorker); - - // Load grammars in the new worker - await new Promise((resolve, reject) => { - parseWorker!.once('message', (msg: { type: string }) => { - if (msg.type === 'grammars-loaded') resolve(); - else reject(new Error(`Unexpected message: ${msg.type}`)); - }); - parseWorker!.postMessage({ type: 'load-grammars', languages: neededLanguages }); - }); + const w = new WorkerClass!(parseWorkerPath); + parseWorker = w; + attachWorkerHandlers(w); + + // Wait for grammars to load — bounded, and rejects (never hangs) if the + // worker dies mid-load. See `awaitWorkerGrammarLoad`. + try { + await awaitWorkerGrammarLoad(w, neededLanguages); + } catch (err) { + // Worker is unusable — tear it down and let the caller degrade. + if (parseWorker === w) parseWorker = null; + try { void w.terminate(); } catch { /* ignore */ } + throw err instanceof Error ? err : new Error(String(err)); + } - return parseWorker; + return w; } if (WorkerClass) { - await ensureWorker(); + try { + await ensureWorker(); + } catch (err) { + workerDisabled = true; + logWarn('Parse worker failed to start — falling back to in-process parsing', { + error: err instanceof Error ? err.message : String(err), + }); + } } /** @@ -773,14 +861,8 @@ export class ExtractionOrchestrator { } async function requestParse(filePath: string, content: string): Promise { - if (!WorkerClass) { - // In-process fallback - return extractFromSource( - filePath, - content, - detectLanguage(filePath, content), - frameworkNames - ); + if (!WorkerClass || workerDisabled) { + return parseInProcess(filePath, content); } // Recycle the worker before the next parse if we've hit the threshold. @@ -790,7 +872,19 @@ export class ExtractionOrchestrator { await recycleWorker(); } - const worker = await ensureWorker(); + let worker: import('worker_threads').Worker; + try { + worker = await ensureWorker(); + } catch (err) { + // The worker died (e.g. crashed while loading grammars) — degrade to + // in-process parsing for the rest of this run rather than failing every + // file or hanging on a worker that will never answer. + workerDisabled = true; + logWarn('Parse worker unavailable — falling back to in-process parsing', { + error: err instanceof Error ? err.message : String(err), + }); + return parseInProcess(filePath, content); + } const id = nextId++; workerParseCount++; diff --git a/src/extraction/parse-worker.ts b/src/extraction/parse-worker.ts index 9b0010ec2..ecd5c5b34 100644 --- a/src/extraction/parse-worker.ts +++ b/src/extraction/parse-worker.ts @@ -57,8 +57,19 @@ const parseCounts = new Map(); parentPort!.on('message', async (msg: { type: string; id?: number; filePath?: string; content?: string; languages?: Language[]; frameworkNames?: string[] }) => { if (msg.type === 'load-grammars') { - await loadGrammarsForLanguages(msg.languages!); - parentPort!.postMessage({ type: 'grammars-loaded' }); + try { + await loadGrammarsForLanguages(msg.languages!); + parentPort!.postMessage({ type: 'grammars-loaded' }); + } catch (err) { + // Don't die silently: a JS-level grammar-load failure must be reported so + // the parent degrades to in-process parsing instead of waiting forever + // for a 'grammars-loaded' that will never come. (A hard WASM abort kills + // the worker outright — the parent's 'exit'/'error' guard covers that.) + parentPort!.postMessage({ + type: 'grammars-load-failed', + error: err instanceof Error ? err.message : String(err), + }); + } } else if (msg.type === 'parse') { const { id, filePath, content, frameworkNames } = msg; try { From 8d940842cc7089938f3a94a51f05765b2910a532 Mon Sep 17 00:00:00 2001 From: arttttt Date: Sat, 30 May 2026 00:17:42 +0300 Subject: [PATCH 2/5] test(extraction): cover the parse-worker grammar-load handshake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Asserts awaitWorkerGrammarLoad settles (never hangs) on every outcome — grammars-loaded, grammars-load-failed, worker exit, worker error, and timeout — and always removes its listeners. Regression guard for the daemon init-hang. Co-Authored-By: Claude Opus 4.8 (1M context) --- __tests__/parse-worker-grammar-load.test.ts | 80 +++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 __tests__/parse-worker-grammar-load.test.ts diff --git a/__tests__/parse-worker-grammar-load.test.ts b/__tests__/parse-worker-grammar-load.test.ts new file mode 100644 index 000000000..4087c0dea --- /dev/null +++ b/__tests__/parse-worker-grammar-load.test.ts @@ -0,0 +1,80 @@ +/** + * Regression test for the MCP daemon init-hang. + * + * `ensureWorker()` used to await a fresh parse worker's grammar-load with a bare + * `once('message')` that only resolved on `grammars-loaded`. If the worker DIED + * while loading grammars (a tree-sitter WASM abort), it never sent that message, + * so the await — and the in-process `indexMutex` it runs under — hung forever. + * In the shared daemon that wedged initialization for every connecting client. + * + * `awaitWorkerGrammarLoad` must settle (never hang) on every outcome: loaded, + * load-failed, worker error, worker exit, or timeout. + */ +import { EventEmitter } from 'node:events'; +import { describe, it, expect } from 'vitest'; +import { awaitWorkerGrammarLoad } from '../src/extraction/index'; + +class FakeWorker extends EventEmitter { + posted: unknown[] = []; + postMessage(msg: unknown): void { + this.posted.push(msg); + } +} + +const noLeakedListeners = (w: FakeWorker) => + w.listenerCount('message') === 0 && + w.listenerCount('error') === 0 && + w.listenerCount('exit') === 0; + +describe('awaitWorkerGrammarLoad — daemon init-hang regression', () => { + it('posts the load-grammars request and resolves on grammars-loaded', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + expect(w.posted).toEqual([{ type: 'load-grammars', languages: ['typescript'] }]); + w.emit('message', { type: 'grammars-loaded' }); + await expect(p).resolves.toBeUndefined(); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects (does not hang) on grammars-load-failed', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('message', { type: 'grammars-load-failed', error: 'bad wasm' }); + await expect(p).rejects.toThrow(/bad wasm/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects (does not hang) when the worker exits before grammars load — the WASM-abort case', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('exit', 1); + await expect(p).rejects.toThrow(/exited/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects (does not hang) when the worker emits an error', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('error', new Error('worker died')); + await expect(p).rejects.toThrow(/worker died/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects on timeout when the worker goes silent — never hangs forever', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 50); + await expect(p).rejects.toThrow(/timed out/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('settles exactly once — a late worker exit after grammars-loaded is ignored', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('message', { type: 'grammars-loaded' }); + await expect(p).resolves.toBeUndefined(); + // The promise already resolved; a later crash must not double-settle or + // throw an unhandled 'error' (listeners are gone, so emitting is a no-op). + expect(() => w.emit('exit', 1)).not.toThrow(); + expect(noLeakedListeners(w)).toBe(true); + }); +}); From 32ffd3563729c5e4badfc4563281d03a998b177d Mon Sep 17 00:00:00 2001 From: arttttt Date: Sat, 30 May 2026 00:20:06 +0300 Subject: [PATCH 3/5] fix(mcp): bound the catch-up gate so a stuck sync can't wedge the first call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defense in depth on top of the parse-worker grammar-load fix. The first tool call blocks on the engine's post-open catch-up sync (the gate) so it never serves rows for files deleted while no server was running. It awaited the gate unbounded — a sync that never settles for any reason would hang the first call and, in the shared daemon, every client that connected after. The wait is now bounded (default 120s, override CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS); on timeout the call proceeds best-effort over possibly-stale data — the same outcome the existing rejection-swallow already accepts. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/mcp/tools.ts | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 935481b8a..a60feb40b 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -255,6 +255,19 @@ function adaptiveExploreEnabled(): boolean { return process.env.CODEGRAPH_ADAPTIVE_EXPLORE !== '0' && process.env.CODEGRAPH_ADAPTIVE_EXPLORE !== 'false'; } +/** + * Upper bound (ms) on how long the first tool call waits for the catch-up gate + * before serving best-effort. Defense in depth: a catch-up sync that never + * settles must not hang the first call — and, in the shared daemon, every + * client. Override via `CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS`; default 120s (well + * above any real incremental reconcile). + */ +function catchUpGateTimeoutMs(): number { + const raw = process.env.CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS; + const n = raw ? Number.parseInt(raw, 10) : NaN; + return Number.isFinite(n) && n > 0 ? n : 120_000; +} + /** * Prefix each line of a source slice with its 1-based line number, matching * the Read tool's `cat -n` convention (number + tab) so the agent treats it @@ -1042,7 +1055,18 @@ export class ToolHandler { if (this.catchUpGate) { const gate = this.catchUpGate; this.catchUpGate = null; - try { await gate; } catch { /* engine already logged */ } + // Bound the wait: a catch-up sync that never settles (for any reason) + // must not wedge the first call. On timeout we proceed best-effort over + // possibly-stale data — the same outcome the rejection-swallow accepts. + try { + await Promise.race([ + gate, + new Promise((resolve) => { + const t = setTimeout(resolve, catchUpGateTimeoutMs()); + t.unref?.(); + }), + ]); + } catch { /* engine already logged */ } } // Honor the optional tool allowlist (CODEGRAPH_MCP_TOOLS): a trimmed // surface rejects ablated tools defensively even if a client cached them. From 1efd2fc3f51832ced9d6dd34f276443c20311bb0 Mon Sep 17 00:00:00 2001 From: arttttt Date: Sat, 30 May 2026 00:20:06 +0300 Subject: [PATCH 4/5] test(mcp): cover the catch-up gate timeout fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Asserts the first tool call still serves (best-effort) when the catch-up gate never settles, instead of hanging — using a tiny CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS override. Co-Authored-By: Claude Opus 4.8 (1M context) --- __tests__/mcp-catchup-gate.test.ts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/__tests__/mcp-catchup-gate.test.ts b/__tests__/mcp-catchup-gate.test.ts index 6baee07c4..ae0152606 100644 --- a/__tests__/mcp-catchup-gate.test.ts +++ b/__tests__/mcp-catchup-gate.test.ts @@ -119,4 +119,21 @@ describe('MCP catch-up gate', () => { expect(res.isError).toBeFalsy(); expect(res.content[0].text).toMatch(/survivor/); }); + + it('a gate that never settles does not hang the first tool call (timeout fallback)', async () => { + // Defense in depth for the daemon init-hang: if catch-up never settles for + // any reason, the first call must still serve (best-effort) after the + // bounded wait instead of hanging forever. Tiny override keeps it fast. + process.env.CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS = '50'; + try { + handler.setCatchUpGate(new Promise(() => { /* never settles */ })); + const start = Date.now(); + const res = await handler.execute('codegraph_search', { query: 'survivor' }); + expect(Date.now() - start).toBeLessThan(5000); + expect(res.isError).toBeFalsy(); + expect(res.content[0].text).toMatch(/survivor/); + } finally { + delete process.env.CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS; + } + }); }); From 15d722da4aae3c2b011fc9ef801b3a8538eb650a Mon Sep 17 00:00:00 2001 From: arttttt Date: Sat, 30 May 2026 00:20:31 +0300 Subject: [PATCH 5/5] docs(changelog): MCP startup no longer hangs on a grammar-load failure Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21b31bb0a..1445423a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Fixes +- The MCP server no longer hangs on startup when the parser fails to load a language grammar. Previously a grammar-load crash could wedge the shared background daemon, so every editor session that connected to it hung waiting to initialize — on any project, large or small. The parser now falls back to in-process parsing instead of hanging, and a stuck startup reconcile can no longer block the first query indefinitely (tunable via `CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS`). - Indexing a project that contains only config-style files (YAML, Twig, or `.properties`) no longer misleadingly reports "No files found to index" — these files are tracked at the file level and are now counted as indexed. Thanks @luojiyin1987 (#357). ## [0.9.7] - 2026-05-28