From 5eedb33b7ffc5c3640ec588ca5dc33e3f2d7c7e7 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 1 Jun 2026 14:08:28 +0300
Subject: [PATCH 1/4] =?UTF-8?q?ci(065):=20eval.yml=20regression=20gate=20?=
 =?UTF-8?q?=E2=80=94=20D2=20security=20(blocking)=20+=20D1=20retrieval=20(?=
 =?UTF-8?q?MCP-742)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Spec 065 / C1 (FR-009, US3/P2). Add `.github/workflows/eval.yml` running both
Spec-065 evaluations as a regression gate over the frozen datasets:

- security-d2 (blocking): provenance/license guard (FR-007/CN-005) → cmd/scan-eval
  ×3 → mcp-eval SecurityScorer. Thresholds --fpr-ceiling 0.10 --recall-floor 0.05
  (the sensitive-data detector measures recall ≈0.10 on this corpus; scorer
  defaults of 0.80 would always fail). Sourced in one place pending the MCP-815
  baseline `security.gate` block so gate and baseline never drift.
- retrieval-d1: boots mcpproxy over snapshot-servers.config.json, waits for index
  readiness, runs the RetrievalScorer with baseline+tolerance. Report-only on PRs
  (npx/uvx fetch flake), blocking on the nightly schedule.

Shared D2 logic in scripts/eval-ci-smoke.sh (CI == local). Reports upload as
artifacts, never committed (CN-003, guarded). mcp-eval checked out at a pinned
public ref. Verified locally: full D2 gate PASS (P=0.667 R=0.100 FPR=0.043),
actionlint clean.

Related #555 datasets; implements MCP-742 (Gate-2 plan rev 2 accepted).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 .github/workflows/eval.yml                    | 187 ++++++++++++++++++
 scripts/eval-ci-smoke.sh                      | 103 ++++++++++
 .../datasets/README.md                        |  33 ++++
 3 files changed, 323 insertions(+)
 create mode 100644 .github/workflows/eval.yml
 create mode 100755 scripts/eval-ci-smoke.sh

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 00000000..bf03eff3
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,187 @@
+name: Eval (Spec 065 regression gate)
+
+# Spec 065 / C1 (FR-009, US3/P2): regression gate over the frozen Spec-065
+# datasets. Two independent jobs so a network flake in D1 (retrieval) never
+# masks or blocks the deterministic D2 (security) gate.
+#
+#   security-d2  — Go + Python only, no live upstreams. HARD gate (blocking).
+#   retrieval-d1 — needs a live mcpproxy serving 7 reference servers; network
+#                  dependent. Report-only on PRs; blocking on the nightly
+#                  schedule and on manual workflow_dispatch runs (promote to
+#                  PR-blocking after a green soak — see the plan on MCP-742).
+#                  Reports are uploaded as artifacts, never committed (CN-003).
+#
+# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out at a
+# pinned ref — no token needed.
+
+on:
+  pull_request:
+    paths:  # NOTE(review): retrieval-d1 builds ./cmd/mcpproxy, which nothing below matches — confirm intended
+      - "cmd/scan-eval/**"
+      - "internal/security/**"
+      - "specs/065-evaluation-foundation/datasets/**"
+      - "scripts/eval-ci-smoke.sh"
+      - ".github/workflows/eval.yml"
+  workflow_dispatch: {}  # manual runs; retrieval-d1 is blocking here (only pull_request is report-only)
+  schedule:
+    # Nightly soak (02:30 UTC) — exercises D1 against live upstreams.
+    - cron: "30 2 * * *"
+
+permissions:
+  contents: read  # read-only token; reports leave the jobs as artifacts, not commits
+
+env:
+  MCP_EVAL_REF: "76df3a47e1480bfde2433b4f19df19312c985963" # SecurityScorer (B3) merge — pin for reproducibility
+  PYTHON_VERSION: "3.11.13"  # interpreter requested from setup-uv in both jobs
+
+jobs:
+  security-d2:
+    name: Security regression gate (D2)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout mcpproxy-go
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Set up Go
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
+        with:
+          go-version: "1.25"
+          cache: true
+
+      - name: Checkout mcp-eval (public, pinned)
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          repository: smart-mcp-proxy/mcp-eval
+          ref: ${{ env.MCP_EVAL_REF }}
+          path: mcp-eval  # nested in the workspace; the gate step points MCP_EVAL_DIR here
+
+      - name: Set up uv + Python
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Sync mcp-eval environment
+        working-directory: mcp-eval
+        run: uv sync
+
+      - name: Run D2 security gate
+        env:
+          MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval  # absolute: the script cd's into it to run the scorer
+          OUT_DIR: reports/security  # relative to the repo root, which the script cd's to first
+        run: bash scripts/eval-ci-smoke.sh
+
+      - name: Upload D2 reports
+        if: always()  # upload whatever exists even when the gate step failed
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: eval-security-d2
+          path: reports/security/
+          retention-days: 14
+          if-no-files-found: ignore  # a run that failed before scoring has no reports to upload
+
+      - name: Assert reports are not committed (CN-003)
+        if: always()
+        run: |
+          tracked="$(git ls-files reports/ || true)"  # lists only files tracked by git under reports/
+          if [ -n "$tracked" ]; then
+            echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:"
+            echo "$tracked"
+            exit 1
+          fi
+          echo "OK: no eval reports are tracked by git."
+
+  retrieval-d1:
+    name: Retrieval regression gate (D1)
+    runs-on: ubuntu-latest
+    # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake
+    # source); blocking on every other trigger (nightly schedule, manual
+    # dispatch). Promote to PR-blocking after a green soak (plan on MCP-742).
+    continue-on-error: ${{ github.event_name == 'pull_request' }}
+    steps:
+      - name: Checkout mcpproxy-go
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Set up Go
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
+        with:
+          go-version: "1.25"
+          cache: true
+
+      - name: Set up Node.js (npx-launched servers)
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "22"
+
+      - name: Checkout mcp-eval (public, pinned)
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          repository: smart-mcp-proxy/mcp-eval
+          ref: ${{ env.MCP_EVAL_REF }}
+          path: mcp-eval
+
+      - name: Set up uv + Python (uvx-launched servers + mcp-eval)
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Sync mcp-eval environment
+        working-directory: mcp-eval
+        run: uv sync
+
+      - name: Build mcpproxy
+        run: go build -o mcpproxy ./cmd/mcpproxy
+
+      - name: Start mcpproxy (7 reference servers)
+        run: |
+          ./mcpproxy serve \
+            --config specs/065-evaluation-foundation/datasets/snapshot-servers.config.json \
+            --data-dir "$RUNNER_TEMP/eval" \
+            --listen 127.0.0.1:8092 \
+            --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
+          echo "MCPPROXY_PID=$!" >> "$GITHUB_ENV"
+
+      - name: Wait for index readiness
+        run: |
+          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
+          for i in $(seq 1 60); do
+            if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then
+              n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \
+                   | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len(d.get("tools",d.get("results",[]))))' 2>/dev/null || echo 0)"
+              echo "attempt $i: index search returned $n result(s)"
+              [ "$n" -ge 1 ] && { echo "Index ready."; exit 0; }
+            else
+              echo "attempt $i: server not up yet"
+            fi
+            sleep 5
+          done
+          echo "::error::mcpproxy index did not become ready in time"
+          tail -50 "$RUNNER_TEMP/mcpproxy.log" || true
+          exit 1
+
+      - name: Run D1 retrieval gate
+        working-directory: mcp-eval
+        env:
+          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
+        run: |
+          PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
+            --corpus "$DS/corpus_v1.tools.json" \
+            --golden "$DS/retrieval_golden_v1.json" \
+            --baseline "$DS/baseline_v1.json" \
+            --tolerance 0.05 \
+            --runs 1 \
+            --base-url http://127.0.0.1:8092 \
+            --api-key eval-corpus-snapshot \
+            --out-dir "${{ github.workspace }}/reports/retrieval"
+
+      - name: Stop mcpproxy
+        if: always()
+        run: kill "${MCPPROXY_PID}" 2>/dev/null || true
+
+      - name: Upload D1 reports
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: eval-retrieval-d1
+          path: reports/retrieval/
+          retention-days: 14
+          if-no-files-found: ignore
diff --git a/scripts/eval-ci-smoke.sh b/scripts/eval-ci-smoke.sh
new file mode 100755
index 00000000..2f625ef0
--- /dev/null
+++ b/scripts/eval-ci-smoke.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+#
+# eval-ci-smoke.sh — Spec 065 / D2 security regression gate (and local smoke).
+#
+# Runs the deterministic D2 half of the Spec-065 evaluation end to end:
+#   1. provenance/license guard over the security corpus (FR-007 / CN-005),
+#   2. scan-eval (this repo) N times -> per-run verdict JSON (FR-010 averaging),
+#   3. mcp-eval SecurityScorer gate (P/R/F1/FPR per detector, absolute thresholds).
+#
+# It is the single source of truth shared by `.github/workflows/eval.yml` (Job A)
+# and local pre-flight, so the gate logic is proven the same way in both places.
+#
+# Exit non-zero if the corpus is empty or fails the provenance guard, scan-eval fails,
+# the scorer gate fails (FPR above ceiling / recall below floor), or CI lacks mcp-eval.
+#
+# Config via env (all have CI-friendly defaults; paths may be relative to repo root):
+#   DATASETS_DIR   dataset directory (default: specs/065-evaluation-foundation/datasets)
+#   MCP_EVAL_DIR   checkout of smart-mcp-proxy/mcp-eval. If unset/missing: locally the scorer
+#                  is SKIPPED (Go-only smoke) with a warning; under GitHub Actions it is an error.
+#   OUT_DIR        report output dir (default: reports/security) — never committed (CN-003)
+#   RUNS           number of scan-eval runs to average (default: 3)
+#   FPR_CEILING    max allowed per-detector false-positive rate (default: 0.10)  # matches MCP-815
+#   RECALL_FLOOR   min allowed detector recall (default: 0.05)                   # matches MCP-815
+#
+# Threshold provenance (critical): the SecurityScorer *defaults* are recall-floor
+# 0.80, but the production `sensitive-data` detector measures recall ~= 0.10 on
+# this corpus (most malicious entries are prompt-injection / tool-poisoning /
+# rug-pull — out of scope for a secret/path detector). The gate therefore uses
+# the MCP-815 thresholds below, not the defaults. Once MCP-815 lands a
+# `security.gate` block in baseline_v1.json, source these from there so the gate
+# and the baseline never drift.
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$REPO_ROOT"
+
+DATASETS_DIR="${DATASETS_DIR:-specs/065-evaluation-foundation/datasets}"
+OUT_DIR="${OUT_DIR:-reports/security}"
+RUNS="${RUNS:-3}"
+FPR_CEILING="${FPR_CEILING:-0.10}"   # matches MCP-815
+RECALL_FLOOR="${RECALL_FLOOR:-0.05}" # matches MCP-815
+CORPUS="${DATASETS_DIR}/security_corpus_v1.json"
+
+ALLOWED_LICENSES="MIT Apache-2.0 BSD-3-Clause CC0-1.0 self-authored"
+
+log() { printf '\n\033[1;34m==>\033[0m %s\n' "$*"; }
+
+[ -f "$CORPUS" ] || { echo "error: corpus not found: $CORPUS" >&2; exit 4; }
+
+log "Provenance / license guard (FR-007 / CN-005) over $CORPUS"
+ALLOWED_LICENSES="$ALLOWED_LICENSES" python3 - "$CORPUS" <<'PY'
+import json, os, sys
+corpus = json.load(open(sys.argv[1]))
+entries = corpus["entries"] if isinstance(corpus, dict) else corpus
+allowed = set(os.environ["ALLOWED_LICENSES"].split())
+bad = [] if entries else ["corpus has no entries"]  # an empty corpus must not pass the guard vacuously
+for e in entries:
+    prov = e.get("provenance") or {}
+    lic = prov.get("license")
+    if not e.get("category"):
+        bad.append(f'{e.get("id","?")}: missing category')
+    if not prov.get("source"):
+        bad.append(f'{e.get("id","?")}: missing provenance.source')
+    if lic not in allowed:
+        bad.append(f'{e.get("id","?")}: license {lic!r} not in allowlist {sorted(allowed)}')
+if bad:
+    print("PROVENANCE GUARD FAILED:", file=sys.stderr)
+    for b in bad:
+        print("  -", b, file=sys.stderr)
+    sys.exit(5)
+print(f"OK: {len(entries)} entries, all carry category + allowlisted provenance.license")
+PY
+
+WORK="$(mktemp -d)"
+trap 'rm -rf "$WORK"' EXIT
+VERDICT_ARGS=()
+log "Running scan-eval x${RUNS} (deterministic detector; N for FR-010 averaging contract)"
+for i in $(seq 1 "$RUNS"); do
+  vf="${WORK}/verdicts_${i}.json"
+  go run ./cmd/scan-eval --corpus "$CORPUS" --out "$vf"
+  VERDICT_ARGS+=(--verdicts "$vf")
+done
+echo "Produced ${RUNS} verdict file(s)."
+
+if [ -z "${MCP_EVAL_DIR:-}" ] || [ ! -d "${MCP_EVAL_DIR:-/nonexistent}" ]; then
+  [ -z "${GITHUB_ACTIONS:-}" ] || { echo "::error::MCP_EVAL_DIR unset or missing in CI — refusing to skip the SecurityScorer gate."; exit 6; }  # in CI the scorer IS the gate
+  echo "::warning::MCP_EVAL_DIR unset or missing — skipping the SecurityScorer gate (Go-only smoke). Set MCP_EVAL_DIR to a smart-mcp-proxy/mcp-eval checkout to run the full gate."
+  exit 0
+fi
+
+ABS_CORPUS="$(cd "$(dirname "$CORPUS")" && pwd)/$(basename "$CORPUS")"
+ABS_OUT="$(mkdir -p "$OUT_DIR" && cd "$OUT_DIR" && pwd)"
+log "SecurityScorer gate: fpr-ceiling=${FPR_CEILING} recall-floor=${RECALL_FLOOR} (MCP-815 thresholds)"
+# mcp-eval is run as a module with PYTHONPATH=src (its console-script entry point
+# is not installed by `uv sync`); uv supplies the synced 3.11 interpreter.
+( cd "$MCP_EVAL_DIR" && PYTHONPATH=src uv run python -m mcp_eval.cli security \
+    "${VERDICT_ARGS[@]}" \
+    --corpus "$ABS_CORPUS" \
+    --fpr-ceiling "$FPR_CEILING" \
+    --recall-floor "$RECALL_FLOOR" \
+    --out-dir "$ABS_OUT" )
+
+log "D2 security gate PASSED — reports in $OUT_DIR"
diff --git a/specs/065-evaluation-foundation/datasets/README.md b/specs/065-evaluation-foundation/datasets/README.md
index 2fc616c1..5b819a71 100644
--- a/specs/065-evaluation-foundation/datasets/README.md
+++ b/specs/065-evaluation-foundation/datasets/README.md
@@ -107,3 +107,36 @@ text from them is vendored into this repo:
 - **`mcp-injection-experiments`** — LICENSE unconfirmed (research.md R-A); where it
   inspired a pattern, the corresponding entry was rewritten from scratch and
   labeled `self-authored`. The corpus test rejects any entry sourced from these.
+
+## CI regression gate (Spec 065 / C1)
+
+`.github/workflows/eval.yml` runs both evaluations as a regression gate
+(FR-009). Two independent jobs keep a network flake in D1 from masking the
+deterministic D2 gate:
+
+- **`security-d2` (D2, blocking)** — Go + Python only, no live upstreams.
+  Provenance/license guard over `security_corpus_v1.json` (FR-007 / CN-005),
+  then `cmd/scan-eval` ×3 → the mcp-eval `SecurityScorer`. Gate thresholds are
+  **`--fpr-ceiling 0.10 --recall-floor 0.05`** (not the scorer defaults: the
+  `sensitive-data` detector measures recall ≈ 0.10 here because most malicious
+  entries are prompt-injection / tool-poisoning / rug-pull, out of scope for a
+  secret/path detector). These thresholds will move to a `security.gate` block
+  in `baseline_v1.json` once MCP-815 lands, so the gate and baseline never drift.
+- **`retrieval-d1` (D1)** — boots `mcpproxy serve` over
+  `snapshot-servers.config.json` (7 reference servers), waits for index
+  readiness, then runs the mcp-eval `RetrievalScorer` with
+  `--baseline baseline_v1.json --tolerance 0.05`. **Report-only on PRs**
+  (npx/uvx fetches are a known flake source), **blocking on the nightly
+  schedule and manual runs**; promote to PR-blocking after a green soak.
+
+Both jobs upload HTML/JSON reports as run artifacts and the build **never
+commits** them (CN-003). The shared D2 logic lives in `scripts/eval-ci-smoke.sh`
+so the gate runs identically in CI and locally:
+
+```bash
+# Local D2 smoke (full gate needs a mcp-eval checkout):
+MCP_EVAL_DIR=/path/to/mcp-eval bash scripts/eval-ci-smoke.sh
+```
+
+mcp-eval (`smart-mcp-proxy/mcp-eval`, public) is checked out at a pinned ref for
+reproducibility.

From 0291e56f1f27ce198c750228c113ff951a6da487 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 1 Jun 2026 14:25:56 +0300
Subject: [PATCH 2/4] =?UTF-8?q?ci(065):=20fix=20D1=20retrieval=20job=20?=
 =?UTF-8?q?=E2=80=94=20create=20data=5Fdir=20+=20single-step=20server=20li?=
 =?UTF-8?q?fecycle?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The retrieval-d1 job failed: `mcpproxy serve` exited immediately with
"data_dir: directory does not exist" (serve refuses to create a missing
data_dir), and the server was backgrounded in a separate step from the readiness
poll (a process backgrounded in one step is reaped when that step's shell exits).

Fix: mkdir -p the data_dir, and boot + readiness-poll + run the scorer in ONE
step (shared shell) with a trap that stops the server however the step ends; also
fail fast if the server process dies during startup. D2 gate unaffected.

Verified: mcpproxy boots and serves /api/v1/status locally with the created
data_dir; actionlint clean.

Related #555 datasets; MCP-742.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 .github/workflows/eval.yml | 52 +++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
index bf03eff3..fc4207f2 100644
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@@ -131,51 +131,57 @@ jobs:
       - name: Build mcpproxy
         run: go build -o mcpproxy ./cmd/mcpproxy
 
-      - name: Start mcpproxy (7 reference servers)
+      # Boot, poll and score in ONE step: a server backgrounded in a separate
+      # step is reaped when that step's shell exits, so start + wait + run must
+      # share a shell. The trap stops the server however this step ends.
+      - name: Run D1 retrieval gate (boot mcpproxy + score)
+        working-directory: ${{ github.workspace }}
+        env:
+          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
         run: |
+          set -uo pipefail
+          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
+          # data_dir must exist — `serve` refuses to create a missing one.
+          mkdir -p "$RUNNER_TEMP/eval"
           ./mcpproxy serve \
-            --config specs/065-evaluation-foundation/datasets/snapshot-servers.config.json \
+            --config "$DS/snapshot-servers.config.json" \
             --data-dir "$RUNNER_TEMP/eval" \
             --listen 127.0.0.1:8092 \
             --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
-          echo "MCPPROXY_PID=$!" >> "$GITHUB_ENV"
+          server_pid=$!
+          trap 'kill "$server_pid" 2>/dev/null || true' EXIT
 
-      - name: Wait for index readiness
-        run: |
-          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
+          ready=0
           for i in $(seq 1 60); do
+            if ! kill -0 "$server_pid" 2>/dev/null; then
+              echo "::error::mcpproxy process exited during startup"
+              break
+            fi
             if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then
               n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \
                    | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len(d.get("tools",d.get("results",[]))))' 2>/dev/null || echo 0)"
               echo "attempt $i: index search returned $n result(s)"
-              [ "$n" -ge 1 ] && { echo "Index ready."; exit 0; }
+              [ "$n" -ge 1 ] && { ready=1; echo "Index ready."; break; }
             else
               echo "attempt $i: server not up yet"
             fi
             sleep 5
           done
-          echo "::error::mcpproxy index did not become ready in time"
-          tail -50 "$RUNNER_TEMP/mcpproxy.log" || true
-          exit 1
+          if [ "$ready" != 1 ]; then
+            echo "::error::mcpproxy index did not become ready in time"
+            echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true
+            exit 1
+          fi
 
-      - name: Run D1 retrieval gate
-        working-directory: mcp-eval
-        env:
-          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
-        run: |
-          PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
+          ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
             --corpus "$DS/corpus_v1.tools.json" \
             --golden "$DS/retrieval_golden_v1.json" \
             --baseline "$DS/baseline_v1.json" \
             --tolerance 0.05 \
             --runs 1 \
-            --base-url http://127.0.0.1:8092 \
-            --api-key eval-corpus-snapshot \
-            --out-dir "${{ github.workspace }}/reports/retrieval"
-
-      - name: Stop mcpproxy
-        if: always()
-        run: kill "${MCPPROXY_PID}" 2>/dev/null || true
+            --base-url "$base" \
+            --api-key "$key" \
+            --out-dir "$GITHUB_WORKSPACE/reports/retrieval" )
 
       - name: Upload D1 reports
         if: always()

From 27b07cde6abadcfd7fdf3e40507f5b31dea48367 Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 1 Jun 2026 14:38:20 +0300
Subject: [PATCH 3/4] =?UTF-8?q?ci(065):=20fix=20D1=20readiness=20probe=20?=
 =?UTF-8?q?=E2=80=94=20parse=20data.results=20from=20index/search=20envelo?=
 =?UTF-8?q?pe?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The retrieval-d1 readiness poll never passed: mcpproxy booted and indexed all 7
servers (45 tools), but the probe parsed the index/search response at the top
level while results are nested under the `{"success":true,"data":{"results":[…]}}`
envelope, so it read 0 every attempt and timed out.

Fix: parse `data.results`. Verified locally — index returns 5 results for q=file
within ~6s of boot. actionlint clean.

Related #555 datasets; MCP-742.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 .github/workflows/eval.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
index fc4207f2..18b33614 100644
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@@ -158,8 +158,9 @@ jobs:
               break
             fi
             if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then
+              # /api/v1/index/search wraps results as {"success":true,"data":{"results":[...]}}
               n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \
-                   | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len(d.get("tools",d.get("results",[]))))' 2>/dev/null || echo 0)"
+                   | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("results", [])))' 2>/dev/null || echo 0)"
               echo "attempt $i: index search returned $n result(s)"
               [ "$n" -ge 1 ] && { ready=1; echo "Index ready."; break; }
             else

From a1a95f4c12c61b32945da73d3c182ba140d502cd Mon Sep 17 00:00:00 2001
From: Algis Dumbris <a.dumbris@gmail.com>
Date: Mon, 1 Jun 2026 14:48:53 +0300
Subject: [PATCH 4/4] ci(065): D1 readiness waits for full tool catalog before
 scoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The retrieval scorer was running against a partially-indexed instance: the
readiness probe passed at the first indexed tool (>=1 search result), so scoring
started before all 7 reference servers connected -> Recall@5 measured 0.387 vs
baseline threshold 0.631 (false regression).

Fix: poll /api/v1/tools until the catalog reaches the near-full count (~45 tools
across the 7 servers) and add a short settle for the index build, then score.

Verified locally end-to-end on a fully-indexed instance: Recall@1/3/5/10 =
0.418/0.560/0.681/0.791, Gate(recall_at_5) PASS (0.681 vs 0.631) — the baseline
is exactly reproducible. actionlint clean.

Related #555 datasets; MCP-742.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 .github/workflows/eval.yml | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
index 18b33614..74727e20 100644
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@@ -151,25 +151,29 @@ jobs:
           server_pid=$!
           trap 'kill "$server_pid" 2>/dev/null || true' EXIT
 
+          # Wait for the FULL tool catalog before scoring: the retrieval index is
+          # built from the connected servers' tools, and scoring a partially
+          # indexed instance tanks recall (a ≥1-result check fires far too early).
+          # The 7 reference servers expose ~45 tools; require near-full + a short
+          # settle. /api/v1/tools wraps as {"success":true,"data":{"tools":[...]}}.
+          # --max-time bounds each probe so a hung server cannot stall the loop.
           ready=0
+          expected=44
           for i in $(seq 1 60); do
             if ! kill -0 "$server_pid" 2>/dev/null; then
               echo "::error::mcpproxy process exited during startup"
               break
             fi
-            if curl -fsS -H "X-API-Key: $key" "$base/api/v1/status" >/dev/null 2>&1; then
-              # /api/v1/index/search wraps results as {"success":true,"data":{"results":[...]}}
-              n="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/index/search?q=file&limit=5" \
-                   | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("results", [])))' 2>/dev/null || echo 0)"
-              echo "attempt $i: index search returned $n result(s)"
-              [ "$n" -ge 1 ] && { ready=1; echo "Index ready."; break; }
-            else
-              echo "attempt $i: server not up yet"
+            t="$(curl -fsS --max-time 10 -H "X-API-Key: $key" "$base/api/v1/tools" \
+                 | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null || echo 0)"
+            echo "attempt $i: catalog has $t tool(s)"
+            if [ "$t" -ge "$expected" ]; then
+              ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break
             fi
             sleep 5
           done
           if [ "$ready" != 1 ]; then
-            echo "::error::mcpproxy index did not become ready in time"
+            echo "::error::mcpproxy catalog did not reach ${expected} tools in time"
             echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true
             exit 1
           fi