From 5eedb33b7ffc5c3640ec588ca5dc33e3f2d7c7e7 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 1 Jun 2026 14:08:28 +0300 Subject: [PATCH 1/4] =?UTF-8?q?ci(065):=20eval.yml=20regression=20gate=20?= =?UTF-8?q?=E2=80=94=20D2=20security=20(blocking)=20+=20D1=20retrieval=20(?= =?UTF-8?q?MCP-742)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spec 065 / C1 (FR-009, US3/P2). Add `.github/workflows/eval.yml` running both Spec-065 evaluations as a regression gate over the frozen datasets: - security-d2 (blocking): provenance/license guard (FR-007/CN-005) → cmd/scan-eval ×3 → mcp-eval SecurityScorer. Thresholds --fpr-ceiling 0.10 --recall-floor 0.05 (the sensitive-data detector measures recall ≈0.10 on this corpus; scorer defaults of 0.80 would always fail). Sourced in one place pending the MCP-815 baseline `security.gate` block so gate and baseline never drift. - retrieval-d1: boots mcpproxy over snapshot-servers.config.json, waits for index readiness, runs the RetrievalScorer with baseline+tolerance. Report-only on PRs (npx/uvx fetch flake), blocking on the nightly schedule. Shared D2 logic in scripts/eval-ci-smoke.sh (CI == local). Reports upload as artifacts, never committed (CN-003, guarded). mcp-eval checked out at a pinned public ref. Verified locally: full D2 gate PASS (P=0.667 R=0.100 FPR=0.043), actionlint clean. Related #555 datasets; implements MCP-742 (Gate-2 plan rev 2 accepted). Co-Authored-By: Paperclip --- .github/workflows/eval.yml | 187 ++++++++++++++++++ scripts/eval-ci-smoke.sh | 103 ++++++++++ .../datasets/README.md | 33 ++++ 3 files changed, 323 insertions(+) create mode 100644 .github/workflows/eval.yml create mode 100755 scripts/eval-ci-smoke.sh diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 00000000..bf03eff3 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,187 @@ +name: Eval (Spec 065 regression gate) + +# Spec 065 / C1 (FR-009, US3/P2): regression gate over the frozen Spec-065 +# datasets. Two independent jobs so a network flake in D1 (retrieval) never +# masks or blocks the deterministic D2 (security) gate. +# +# security-d2 — Go + Python only, no live upstreams. HARD gate (blocking). +# retrieval-d1 — needs a live mcpproxy serving 7 reference servers; network +# dependent. Report-only on PRs, blocking on the nightly +# schedule (promote to PR-blocking after a green soak — see the +# plan on MCP-742). Reports are uploaded as artifacts, never +# committed (CN-003). +# +# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out at a +# pinned ref — no token needed. + +on: + pull_request: + paths: + - "cmd/scan-eval/**" + - "internal/security/**" + - "specs/065-evaluation-foundation/datasets/**" + - "scripts/eval-ci-smoke.sh" + - ".github/workflows/eval.yml" + workflow_dispatch: {} + schedule: + # Nightly soak (02:30 UTC) — exercises D1 against live upstreams. + - cron: "30 2 * * *" + +permissions: + contents: read + +env: + MCP_EVAL_REF: "76df3a47e1480bfde2433b4f19df19312c985963" # SecurityScorer (B3) merge — pin for reproducibility + PYTHON_VERSION: "3.11.13" + +jobs: + security-d2: + name: Security regression gate (D2) + runs-on: ubuntu-latest + steps: + - name: Checkout mcpproxy-go + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Set up Go + uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0 + with: + go-version: "1.25" + cache: true + + - name: Checkout mcp-eval (public, pinned) + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + with: + repository: smart-mcp-proxy/mcp-eval + ref: ${{ env.MCP_EVAL_REF }} + path: mcp-eval + + - name: Set up uv + Python + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Sync mcp-eval environment + working-directory: mcp-eval + run: uv sync + + - name: Run D2 security gate + env: + MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval + OUT_DIR: reports/security + run: bash scripts/eval-ci-smoke.sh + + - name: Upload D2 reports + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: eval-security-d2 + path: reports/security/ + retention-days: 14 + if-no-files-found: ignore + + - name: Assert reports are not committed (CN-003) + if: always() + run: | + tracked="$(git ls-files reports/ || true)" + if [ -n "$tracked" ]; then + echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:" + echo "$tracked" + exit 1 + fi + echo "OK: no eval reports are tracked by git." + + retrieval-d1: + name: Retrieval regression gate (D1) + runs-on: ubuntu-latest + # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake + # source); blocking on the nightly schedule. Promote to PR-blocking after a + # green soak (plan on MCP-742). + continue-on-error: ${{ github.event_name == 'pull_request' }} + steps: + - name: Checkout mcpproxy-go + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Set up Go + uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0 + with: + go-version: "1.25" + cache: true + + - name: Set up Node.js (npx-launched servers) + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: "22" + + - name: Checkout mcp-eval (public, pinned) + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + with: + repository: smart-mcp-proxy/mcp-eval + ref: ${{ env.MCP_EVAL_REF }} + path: mcp-eval + + - name: Set up uv + Python (uvx-launched servers + mcp-eval) + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Sync mcp-eval environment + working-directory: mcp-eval + run: uv sync + + - name: Build mcpproxy + run: go build -o mcpproxy ./cmd/mcpproxy + + - name: Start mcpproxy (7 reference servers) + run: | + ./mcpproxy serve \ + --config specs/065-evaluation-foundation/datasets/snapshot-servers.config.json \ + --data-dir "$RUNNER_TEMP/eval" \ + --listen 127.0.0.1:8092 \ + --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 & + echo "MCPPROXY_PID=$!" >> "$GITHUB_ENV" + + - name: Wait for index readiness + run: | + base="http://127.0.0.1:8092"; key="eval-corpus-snapshot" + for i in $(seq 1 60); do + if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then + n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \ + | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len(d.get("tools",d.get("results",[]))))' 2>/dev/null || echo 0)" + echo "attempt $i: index search returned $n result(s)" + [ "$n" -ge 1 ] && { echo "Index ready."; exit 0; } + else + echo "attempt $i: server not up yet" + fi + sleep 5 + done + echo "::error::mcpproxy index did not become ready in time" + tail -50 "$RUNNER_TEMP/mcpproxy.log" || true + exit 1 + + - name: Run D1 retrieval gate + working-directory: mcp-eval + env: + DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets + run: | + PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \ + --corpus "$DS/corpus_v1.tools.json" \ + --golden "$DS/retrieval_golden_v1.json" \ + --baseline "$DS/baseline_v1.json" \ + --tolerance 0.05 \ + --runs 1 \ + --base-url http://127.0.0.1:8092 \ + --api-key eval-corpus-snapshot \ + --out-dir "${{ github.workspace }}/reports/retrieval" + + - name: Stop mcpproxy + if: always() + run: kill "${MCPPROXY_PID}" 2>/dev/null || true + + - name: Upload D1 reports + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: eval-retrieval-d1 + path: reports/retrieval/ + retention-days: 14 + if-no-files-found: ignore diff --git a/scripts/eval-ci-smoke.sh b/scripts/eval-ci-smoke.sh new file mode 100755 index 00000000..2f625ef0 --- /dev/null +++ b/scripts/eval-ci-smoke.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# +# eval-ci-smoke.sh — Spec 065 / D2 security regression gate (and local smoke). +# +# Runs the deterministic D2 half of the Spec-065 evaluation end to end: +# 1. provenance/license guard over the security corpus (FR-007 / CN-005), +# 2. scan-eval (this repo) N times -> per-run verdict JSON (FR-010 averaging), +# 3. mcp-eval SecurityScorer gate (P/R/F1/FPR per detector, absolute thresholds). +# +# It is the single source of truth shared by `.github/workflows/eval.yml` (Job A) +# and local pre-flight, so the gate logic is proven the same way in both places. +# +# Exit non-zero if the corpus fails the provenance guard, scan-eval fails, or the +# SecurityScorer gate fails (FPR above ceiling / recall below floor). +# +# Config via env (all have CI-friendly defaults; paths may be relative to repo root): +# DATASETS_DIR dataset directory (default: specs/065-evaluation-foundation/datasets) +# MCP_EVAL_DIR checkout of smart-mcp-proxy/mcp-eval. If unset/missing, the +# SecurityScorer step is SKIPPED (Go-only smoke) with a warning. +# OUT_DIR report output dir (default: reports/security) — never committed (CN-003) +# RUNS number of scan-eval runs to average (default: 3) +# FPR_CEILING max allowed per-detector false-positive rate (default: 0.10) # matches MCP-815 +# RECALL_FLOOR min allowed detector recall (default: 0.05) # matches MCP-815 +# +# Threshold provenance (critical): the SecurityScorer *defaults* are recall-floor +# 0.80, but the production `sensitive-data` detector measures recall ~= 0.10 on +# this corpus (most malicious entries are prompt-injection / tool-poisoning / +# rug-pull — out of scope for a secret/path detector). The gate therefore uses +# the MCP-815 thresholds below, not the defaults. Once MCP-815 lands a +# `security.gate` block in baseline_v1.json, source these from there so the gate +# and the baseline never drift. +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$REPO_ROOT" + +DATASETS_DIR="${DATASETS_DIR:-specs/065-evaluation-foundation/datasets}" +OUT_DIR="${OUT_DIR:-reports/security}" +RUNS="${RUNS:-3}" +FPR_CEILING="${FPR_CEILING:-0.10}" # matches MCP-815 +RECALL_FLOOR="${RECALL_FLOOR:-0.05}" # matches MCP-815 +CORPUS="${DATASETS_DIR}/security_corpus_v1.json" + +ALLOWED_LICENSES="MIT Apache-2.0 BSD-3-Clause CC0-1.0 self-authored" + +log() { printf '\n\033[1;34m==>\033[0m %s\n' "$*"; } + +[ -f "$CORPUS" ] || { echo "error: corpus not found: $CORPUS" >&2; exit 4; } + +log "Provenance / license guard (FR-007 / CN-005) over $CORPUS" +ALLOWED_LICENSES="$ALLOWED_LICENSES" python3 - "$CORPUS" <<'PY' +import json, os, sys +corpus = json.load(open(sys.argv[1])) +entries = corpus["entries"] if isinstance(corpus, dict) else corpus +allowed = set(os.environ["ALLOWED_LICENSES"].split()) +bad = [] +for e in entries: + prov = e.get("provenance") or {} + lic = prov.get("license") + if not e.get("category"): + bad.append(f'{e.get("id","?")}: missing category') + if not prov.get("source"): + bad.append(f'{e.get("id","?")}: missing provenance.source') + if lic not in allowed: + bad.append(f'{e.get("id","?")}: license {lic!r} not in allowlist {sorted(allowed)}') +if bad: + print("PROVENANCE GUARD FAILED:", file=sys.stderr) + for b in bad: + print(" -", b, file=sys.stderr) + sys.exit(5) +print(f"OK: {len(entries)} entries, all carry category + allowlisted provenance.license") +PY + +WORK="$(mktemp -d)" +trap 'rm -rf "$WORK"' EXIT +VERDICT_ARGS=() +log "Running scan-eval x${RUNS} (deterministic detector; N for FR-010 averaging contract)" +for i in $(seq 1 "$RUNS"); do + vf="${WORK}/verdicts_${i}.json" + go run ./cmd/scan-eval --corpus "$CORPUS" --out "$vf" + VERDICT_ARGS+=(--verdicts "$vf") +done +echo "Produced ${RUNS} verdict file(s)." + +if [ -z "${MCP_EVAL_DIR:-}" ] || [ ! -d "${MCP_EVAL_DIR:-/nonexistent}" ]; then + echo "::warning::MCP_EVAL_DIR unset or missing — skipping the SecurityScorer gate (Go-only smoke). Set MCP_EVAL_DIR to a smart-mcp-proxy/mcp-eval checkout to run the full gate." + exit 0 +fi + +mkdir -p "$OUT_DIR" +ABS_CORPUS="$(cd "$(dirname "$CORPUS")" && pwd)/$(basename "$CORPUS")" +ABS_OUT="$(cd "$OUT_DIR" && pwd)" +log "SecurityScorer gate: fpr-ceiling=${FPR_CEILING} recall-floor=${RECALL_FLOOR} (MCP-815 thresholds)" +# mcp-eval is run as a module with PYTHONPATH=src (its console-script entry point +# is not installed by `uv sync`); uv supplies the synced 3.11 interpreter. +( cd "$MCP_EVAL_DIR" && PYTHONPATH=src uv run python -m mcp_eval.cli security \ + "${VERDICT_ARGS[@]}" \ + --corpus "$ABS_CORPUS" \ + --fpr-ceiling "$FPR_CEILING" \ + --recall-floor "$RECALL_FLOOR" \ + --out-dir "$ABS_OUT" ) + +log "D2 security gate PASSED — reports in $OUT_DIR" diff --git a/specs/065-evaluation-foundation/datasets/README.md b/specs/065-evaluation-foundation/datasets/README.md index 2fc616c1..5b819a71 100644 --- a/specs/065-evaluation-foundation/datasets/README.md +++ b/specs/065-evaluation-foundation/datasets/README.md @@ -107,3 +107,36 @@ text from them is vendored into this repo: - **`mcp-injection-experiments`** — LICENSE unconfirmed (research.md R-A); where it inspired a pattern, the corresponding entry was rewritten from scratch and labeled `self-authored`. The corpus test rejects any entry sourced from these. + +## CI regression gate (Spec 065 / C1) + +`.github/workflows/eval.yml` runs both evaluations as a regression gate +(FR-009). Two independent jobs keep a network flake in D1 from masking the +deterministic D2 gate: + +- **`security-d2` (D2, blocking)** — Go + Python only, no live upstreams. + Provenance/license guard over `security_corpus_v1.json` (FR-007 / CN-005), + then `cmd/scan-eval` ×3 → the mcp-eval `SecurityScorer`. Gate thresholds are + **`--fpr-ceiling 0.10 --recall-floor 0.05`** (not the scorer defaults: the + `sensitive-data` detector measures recall ≈ 0.10 here because most malicious + entries are prompt-injection / tool-poisoning / rug-pull, out of scope for a + secret/path detector). These thresholds will move to a `security.gate` block + in `baseline_v1.json` once MCP-815 lands, so the gate and baseline never drift. +- **`retrieval-d1` (D1)** — boots `mcpproxy serve` over + `snapshot-servers.config.json` (7 reference servers), waits for index + readiness, then runs the mcp-eval `RetrievalScorer` with + `--baseline baseline_v1.json --tolerance 0.05`. **Report-only on PRs** + (npx/uvx fetches are a known flake source), **blocking on the nightly + schedule**; promote to PR-blocking after a green soak. + +Both jobs upload HTML/JSON reports as run artifacts and the build **never +commits** them (CN-003). The shared D2 logic lives in `scripts/eval-ci-smoke.sh` +so the gate runs identically in CI and locally: + +```bash +# Local D2 smoke (full gate needs a mcp-eval checkout): +MCP_EVAL_DIR=/path/to/mcp-eval bash scripts/eval-ci-smoke.sh +``` + +mcp-eval (`smart-mcp-proxy/mcp-eval`, public) is checked out at a pinned ref for +reproducibility. From 0291e56f1f27ce198c750228c113ff951a6da487 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 1 Jun 2026 14:25:56 +0300 Subject: [PATCH 2/4] =?UTF-8?q?ci(065):=20fix=20D1=20retrieval=20job=20?= =?UTF-8?q?=E2=80=94=20create=20data=5Fdir=20+=20single-step=20server=20li?= =?UTF-8?q?fecycle?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retrieval-d1 job failed: `mcpproxy serve` exited immediately with "data_dir: directory does not exist" (serve refuses to create a missing data_dir), and the server was backgrounded in a separate step from the readiness poll (a process backgrounded in one step is reaped when that step's shell exits). Fix: mkdir -p the data_dir, and boot + readiness-poll + run the scorer in ONE step (shared shell) with a trap that stops the server however the step ends; also fail fast if the server process dies during startup. D2 gate unaffected. Verified: mcpproxy boots and serves /api/v1/status locally with the created data_dir; actionlint clean. Related #555 datasets; MCP-742. Co-Authored-By: Paperclip --- .github/workflows/eval.yml | 52 +++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index bf03eff3..fc4207f2 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -131,51 +131,57 @@ jobs: - name: Build mcpproxy run: go build -o mcpproxy ./cmd/mcpproxy - - name: Start mcpproxy (7 reference servers) + # Boot, poll and score in ONE step: a server backgrounded in a separate + # step is reaped when that step's shell exits, so start + wait + run must + # share a shell. The trap stops the server however this step ends. + - name: Run D1 retrieval gate (boot mcpproxy + score) + working-directory: ${{ github.workspace }} + env: + DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets run: | + set -uo pipefail + base="http://127.0.0.1:8092"; key="eval-corpus-snapshot" + # data_dir must exist — `serve` refuses to create a missing one. + mkdir -p "$RUNNER_TEMP/eval" ./mcpproxy serve \ - --config specs/065-evaluation-foundation/datasets/snapshot-servers.config.json \ + --config "$DS/snapshot-servers.config.json" \ --data-dir "$RUNNER_TEMP/eval" \ --listen 127.0.0.1:8092 \ --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 & - echo "MCPPROXY_PID=$!" >> "$GITHUB_ENV" + server_pid=$! + trap 'kill "$server_pid" 2>/dev/null || true' EXIT - - name: Wait for index readiness - run: | - base="http://127.0.0.1:8092"; key="eval-corpus-snapshot" + ready=0 for i in $(seq 1 60); do + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "::error::mcpproxy process exited during startup" + break + fi if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \ | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len(d.get("tools",d.get("results",[]))))' 2>/dev/null || echo 0)" echo "attempt $i: index search returned $n result(s)" - [ "$n" -ge 1 ] && { echo "Index ready."; exit 0; } + [ "$n" -ge 1 ] && { ready=1; echo "Index ready."; break; } else echo "attempt $i: server not up yet" fi sleep 5 done - echo "::error::mcpproxy index did not become ready in time" - tail -50 "$RUNNER_TEMP/mcpproxy.log" || true - exit 1 + if [ "$ready" != 1 ]; then + echo "::error::mcpproxy index did not become ready in time" + echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true + exit 1 + fi - - name: Run D1 retrieval gate - working-directory: mcp-eval - env: - DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets - run: | - PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \ + ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \ --corpus "$DS/corpus_v1.tools.json" \ --golden "$DS/retrieval_golden_v1.json" \ --baseline "$DS/baseline_v1.json" \ --tolerance 0.05 \ --runs 1 \ - --base-url http://127.0.0.1:8092 \ - --api-key eval-corpus-snapshot \ - --out-dir "${{ github.workspace }}/reports/retrieval" - - - name: Stop mcpproxy - if: always() - run: kill "${MCPPROXY_PID}" 2>/dev/null || true + --base-url "$base" \ + --api-key "$key" \ + --out-dir "$GITHUB_WORKSPACE/reports/retrieval" ) - name: Upload D1 reports if: always() From 27b07cde6abadcfd7fdf3e40507f5b31dea48367 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 1 Jun 2026 14:38:20 +0300 Subject: [PATCH 3/4] =?UTF-8?q?ci(065):=20fix=20D1=20readiness=20probe=20?= =?UTF-8?q?=E2=80=94=20parse=20data.results=20from=20index/search=20envelo?= =?UTF-8?q?pe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retrieval-d1 readiness poll never passed: mcpproxy booted and indexed all 7 servers (45 tools), but the probe parsed the index/search response at the top level while results are nested under the `{"success":true,"data":{"results":[…]}}` envelope, so it read 0 every attempt and timed out. Fix: parse `data.results`. Verified locally — index returns 5 results for q=file within ~6s of boot. actionlint clean. Related #555 datasets; MCP-742. Co-Authored-By: Paperclip --- .github/workflows/eval.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index fc4207f2..18b33614 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -158,8 +158,9 @@ jobs: break fi if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then + # /api/v1/index/search wraps results as {"success":true,"data":{"results":[...]}} n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \ - | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len(d.get("tools",d.get("results",[]))))' 2>/dev/null || echo 0)" + | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("results", [])))' 2>/dev/null || echo 0)" echo "attempt $i: index search returned $n result(s)" [ "$n" -ge 1 ] && { ready=1; echo "Index ready."; break; } else From a1a95f4c12c61b32945da73d3c182ba140d502cd Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Mon, 1 Jun 2026 14:48:53 +0300 Subject: [PATCH 4/4] ci(065): D1 readiness waits for full tool catalog before scoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retrieval scorer was running against a partially-indexed instance: the readiness probe passed at the first indexed tool (>=1 search result), so scoring started before all 7 reference servers connected -> Recall@5 measured 0.387 vs baseline threshold 0.631 (false regression). Fix: poll /api/v1/tools until the catalog reaches the near-full count (~45 tools across the 7 servers) and add a short settle for the index build, then score. Verified locally end-to-end on a fully-indexed instance: Recall@1/3/5/10 = 0.418/0.560/0.681/0.791, Gate(recall_at_5) PASS (0.681 vs 0.631) — the baseline is exactly reproducible. actionlint clean. Related #555 datasets; MCP-742. Co-Authored-By: Paperclip --- .github/workflows/eval.yml | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 18b33614..74727e20 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -151,25 +151,29 @@ jobs: server_pid=$! trap 'kill "$server_pid" 2>/dev/null || true' EXIT + # Wait for the FULL tool catalog before scoring: the retrieval index is + # built from the connected servers' tools, and scoring a partially + # indexed instance tanks recall (a ≥1-result check fires far too early). + # The 7 reference servers expose ~45 tools; require near-full + a short + # settle for the index build. /api/v1/tools wraps as + # {"success":true,"data":{"tools":[...]}}. ready=0 + expected=44 for i in $(seq 1 60); do if ! kill -0 "$server_pid" 2>/dev/null; then echo "::error::mcpproxy process exited during startup" break fi - if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then - # /api/v1/index/search wraps results as {"success":true,"data":{"results":[...]}} - n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \ - | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("results", [])))' 2>/dev/null || echo 0)" - echo "attempt $i: index search returned $n result(s)" - [ "$n" -ge 1 ] && { ready=1; echo "Index ready."; break; } - else - echo "attempt $i: server not up yet" + t="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/tools" \ + | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null || echo 0)" + echo "attempt $i: catalog has $t tool(s)" + if [ "$t" -ge "$expected" ]; then + ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break fi sleep 5 done if [ "$ready" != 1 ]; then - echo "::error::mcpproxy index did not become ready in time" + echo "::error::mcpproxy catalog did not reach ${expected} tools in time" echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true exit 1 fi