From b46dab72015dbe068f00e5f9d28e7b5353175363 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Fri, 15 May 2026 08:40:50 -0600 Subject: [PATCH 1/2] fix(bench): warmup + median for queryTimeMs to remove cold-start noise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The build-benchmark queryTimeMs metric was a single-shot cold call to fnDepsData('buildGraph', dbPath) with no warmup, no median, and noTests: false. This conflated steady-state query latency with NAPI/rusqlite/OS-page-cache init (~65ms on macOS) and let fixture-file growth from new native extractors inflate the measurement. Local verification on the same 757-file corpus: v3.9.6: single-shot 78.8ms warmed median 4.0ms HEAD: single-shot 67.5ms warmed median 2.8ms HEAD is faster than v3.9.6 on both metrics — the +110% spike that tripped the publish gate during the R + Solidity merge (#1100, #1102) was single-shot cold-start variance, not a real regression. Switch queryTimeMs to 3 warmup runs + median of 5 with noTests: true, matching the methodology already used by query-benchmark.ts and the per-target queries.*Ms block in the same script. Update the KNOWN_REGRESSIONS comment for 3.10.0:Query time to record the methodology fix; keep the entry in place until 3.11.0+ data captures the new steady-state. 
Closes #1113 --- scripts/benchmark.ts | 20 +++++++++++++++++--- tests/benchmarks/regression-guard.test.ts | 23 +++++++++++++---------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/scripts/benchmark.ts b/scripts/benchmark.ts index aaacb65d..fa09c26d 100644 --- a/scripts/benchmark.ts +++ b/scripts/benchmark.ts @@ -133,9 +133,23 @@ const buildStart = performance.now(); const buildResult = await buildGraph(root, { engine, incremental: false }); const buildTimeMs = performance.now() - buildStart; -const queryStart = performance.now(); -fnDepsData('buildGraph', dbPath); -const queryTimeMs = performance.now() - queryStart; +// Warmed median of QUERY_RUNS samples with `noTests: true` to match the +// methodology used by query-benchmark.ts and the per-target `queries.*Ms` +// block below. Earlier versions of this script measured a single cold call, +// which conflated steady-state query latency with NAPI/rusqlite/OS-page-cache +// init costs (~65ms on macOS) and inflated growth from test-fixture files +// pulled in by new native extractors. See #1113 for the methodology rationale. +const QUERY_WARMUP_RUNS = 3; +for (let i = 0; i < QUERY_WARMUP_RUNS; i++) { + fnDepsData('buildGraph', dbPath, { depth: 3, noTests: true }); +} +const queryTimings: number[] = []; +for (let i = 0; i < QUERY_RUNS; i++) { + const start = performance.now(); + fnDepsData('buildGraph', dbPath, { depth: 3, noTests: true }); + queryTimings.push(performance.now() - start); +} +const queryTimeMs = median(queryTimings); const stats = statsData(dbPath); const totalFiles = stats.files.total; diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts index 4125325e..a0a3abfc 100644 --- a/tests/benchmarks/regression-guard.test.ts +++ b/tests/benchmarks/regression-guard.test.ts @@ -188,16 +188,19 @@ const SKIP_VERSIONS = new Set(['3.8.0']); * one-time bump as the cost of supporting Verilog. Tracked separately; * exempt this release. 
* - * - 3.10.0:Query time — cumulative effect of adding two native extractors - * (Solidity #1100 + R #1102) in quick succession. Neither tripped the - * threshold individually (Solidity PR's Query time stayed at 49ms, R PR - * showed no warning), but the combined +110% (49.6 → ~105ms) on the - * `fnDepsData('buildGraph', dbPath)` measurement reflects natural graph - * growth: ~1100 LoC of new extractor code + 9 fixture files added to the - * self-build benchmark expand `buildGraph`'s transitive callee count and - * DB row counts. Tracked in #1113 — exempt this release; remove once - * 3.11.0+ data captures the new steady-state and the per-language - * fixture footprint has been evaluated. + * - 3.10.0:Query time — methodology artifact, not a real regression. The + * metric was a single-shot cold call to `fnDepsData('buildGraph', dbPath)` + * with no warmup, no median, and `noTests: false` — so it captured ~65ms + * of NAPI/rusqlite/OS-page-cache init plus the cost of walking through + * fixture files added by new language extractors. Local v3.9.6 vs HEAD + * on the same corpus measured 78.8ms vs 67.5ms single-shot (HEAD faster), + * while the warmed `queries.fnDepsMs` in the same benchmark showed 4.0ms + * vs 2.8ms — confirming no underlying regression. Methodology fixed in + * #1113: queryTimeMs now uses 3 warmup runs + median of 5 with + * `noTests: true`, matching query-benchmark.ts hygiene. Exemption kept + * in place until 3.11.0+ data captures the new steady-state under the + * updated methodology (expected ~36ms native on this corpus); remove + * the entry then. * * - 3.10.0:fnDeps depth 5 — same cause as Query time above. 
Merging main * into #1102 added the Erlang extractor (#1103) on top of the existing From de2212c2b981ccad2b6ce1e5808fe2af4f05108d Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Fri, 15 May 2026 09:22:32 -0600 Subject: [PATCH 2/2] fix(bench): hoist QUERY_WARMUP_RUNS, warm benchQuery independently (#1133) Addresses Greptile review feedback: - Move QUERY_WARMUP_RUNS to the run-constants block at the top of the worker section, alongside INCREMENTAL_RUNS and QUERY_RUNS, so all run counts are tunable in one place. - Add the warmup loop inside benchQuery itself instead of relying on the queryTimeMs warmup to implicitly prime caches for later calls. Each call site now warms independently, so the methodology no longer depends on call ordering. - Use benchQuery for the top-level queryTimeMs measurement to eliminate duplicated warmup/timing logic and update the comment to describe the actual parity (benchQuery is also warmed) rather than implicit coupling. --- scripts/benchmark.ts | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/scripts/benchmark.ts b/scripts/benchmark.ts index fa09c26d..f8daa0ad 100644 --- a/scripts/benchmark.ts +++ b/scripts/benchmark.ts @@ -92,6 +92,7 @@ try { const INCREMENTAL_RUNS = 3; const QUERY_RUNS = 5; +const QUERY_WARMUP_RUNS = 3; const PROBE_FILE = path.join(root, 'src', 'domain', 'queries.ts'); function median(arr) { @@ -135,21 +136,12 @@ const buildTimeMs = performance.now() - buildStart; // Warmed median of QUERY_RUNS samples with `noTests: true` to match the // methodology used by query-benchmark.ts and the per-target `queries.*Ms` -// block below. Earlier versions of this script measured a single cold call, -// which conflated steady-state query latency with NAPI/rusqlite/OS-page-cache -// init costs (~65ms on macOS) and inflated growth from test-fixture files -// pulled in by new native extractors. See #1113 for the methodology rationale. 
-const QUERY_WARMUP_RUNS = 3; -for (let i = 0; i < QUERY_WARMUP_RUNS; i++) { - fnDepsData('buildGraph', dbPath, { depth: 3, noTests: true }); -} -const queryTimings: number[] = []; -for (let i = 0; i < QUERY_RUNS; i++) { - const start = performance.now(); - fnDepsData('buildGraph', dbPath, { depth: 3, noTests: true }); - queryTimings.push(performance.now() - start); -} -const queryTimeMs = median(queryTimings); +// block below (which calls `benchQuery`, also warmed). Earlier versions of +// this script measured a single cold call, which conflated steady-state +// query latency with NAPI/rusqlite/OS-page-cache init costs (~65ms on +// macOS) and inflated growth from test-fixture files pulled in by new +// native extractors. See #1113 for the methodology rationale. +const queryTimeMs = benchQuery(fnDepsData, 'buildGraph', dbPath, { depth: 3, noTests: true }); const stats = statsData(dbPath); const totalFiles = stats.files.total; @@ -205,6 +197,11 @@ const targets = workerTargets() || selectTargets(); console.error(` hub=${targets.hub}, leaf=${targets.leaf}`); function benchQuery(fn, ...args) { + // Warmup runs prime NAPI bindings, the rusqlite statement cache, and the + // OS page cache so the timed loop measures steady-state query latency + // rather than first-call init (~65ms on macOS). Each call site warms + // independently — methodology does not rely on call ordering elsewhere. + for (let i = 0; i < QUERY_WARMUP_RUNS; i++) fn(...args); const timings = []; for (let i = 0; i < QUERY_RUNS; i++) { const start = performance.now();