smart-mcp-proxy · Dumbris · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -0,0 +1,198 @@
+name: Eval (Spec 065 regression gate)
+
+# Spec 065 / C1 (FR-009, US3/P2): regression gate over the frozen Spec-065
+# datasets. Two independent jobs so a network flake in D1 (retrieval) never
+# masks or blocks the deterministic D2 (security) gate.
+#
+#   security-d2  — Go + Python only, no live upstreams. HARD gate (blocking).
+#   retrieval-d1 — needs a live mcpproxy serving 7 reference servers, so it is
+#                  network-dependent. Report-only on PRs, blocking on the nightly
+#                  schedule (promote to PR-blocking after a green soak — see the
+#                  plan on MCP-742). Reports are uploaded as artifacts, never
+#                  committed (CN-003).
+#
+# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out at a
+# pinned ref — no token needed.
+
+on:
+  pull_request:  # PR runs are limited to changes under the paths listed below
+    paths:
+      - "cmd/scan-eval/**"
+      - "internal/security/**"
+      - "specs/065-evaluation-foundation/datasets/**"
+      - "scripts/eval-ci-smoke.sh"
+      - ".github/workflows/eval.yml"  # edits to this workflow re-run it
+  workflow_dispatch: {}  # manual runs; no inputs defined
+  schedule:
+    # Nightly soak (02:30 UTC) — exercises D1 against live upstreams.
+    - cron: "30 2 * * *"
+
+permissions:
+  contents: read  # the only token scope granted; no job in this file overrides it
+
+env:
+  MCP_EVAL_REF: "76df3a47e1480bfde2433b4f19df19312c985963" # SecurityScorer (B3) merge — pin for reproducibility
+  PYTHON_VERSION: "3.11.13"  # passed to the setup-uv step of both jobs
+
+jobs:
+  security-d2:
+    name: Security regression gate (D2)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout mcpproxy-go
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Set up Go
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
+        with:
+          go-version: "1.25"  # quoted so YAML keeps the version a string
+          cache: true
+
+      - name: Checkout mcp-eval (public, pinned)
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          repository: smart-mcp-proxy/mcp-eval
+          ref: ${{ env.MCP_EVAL_REF }}  # commit SHA pinned in the workflow-level env
+          path: mcp-eval  # second checkout, placed in a subdirectory of the workspace
+
+      - name: Set up uv + Python
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Sync mcp-eval environment
+        working-directory: mcp-eval
+        run: uv sync  # prepares the environment that the script's "uv run" relies on
+
+      - name: Run D2 security gate
+        env:
+          MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval  # the checkout above; the script skips the scorer when this is unset/missing
+          OUT_DIR: reports/security  # same directory the upload step below collects
+        run: bash scripts/eval-ci-smoke.sh
+
+      - name: Upload D2 reports
+        if: always()  # also runs after a failed gate so its reports can be inspected
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: eval-security-d2
+          path: reports/security/
+          retention-days: 14
+          if-no-files-found: ignore  # an early failure may leave nothing to upload
+
+      - name: Assert reports are not committed (CN-003)
+        if: always()
+        run: |
+          tracked="$(git ls-files reports/ || true)"  # paths under reports/ that git tracks
+          if [ -n "$tracked" ]; then
+            echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:"
+            echo "$tracked"
+            exit 1
+          fi
+          echo "OK: no eval reports are tracked by git."
+
+  retrieval-d1:
+    name: Retrieval regression gate (D1)
+    runs-on: ubuntu-latest
+    # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake
+    # source); blocking on the nightly schedule. Promote to PR-blocking after a
+    # green soak (plan on MCP-742).
+    continue-on-error: ${{ github.event_name == 'pull_request' }}  # true only for pull_request runs
+    steps:
+      - name: Checkout mcpproxy-go
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Set up Go
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
+        with:
+          go-version: "1.25"  # quoted so YAML keeps the version a string
+          cache: true
+
+      - name: Set up Node.js (npx-launched servers)
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "22"
+
+      - name: Checkout mcp-eval (public, pinned)
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          repository: smart-mcp-proxy/mcp-eval
+          ref: ${{ env.MCP_EVAL_REF }}  # commit SHA pinned in the workflow-level env
+          path: mcp-eval  # the scoring step later cd's into this directory
+
+      - name: Set up uv + Python (uvx-launched servers + mcp-eval)
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Sync mcp-eval environment
+        working-directory: mcp-eval
+        run: uv sync  # prepares the environment that the scoring step's "uv run" relies on
+
+      - name: Build mcpproxy
+        run: go build -o mcpproxy ./cmd/mcpproxy  # binary lands in the workspace root, where the next step runs ./mcpproxy
+      # Boot, poll and score in ONE step: a server backgrounded in a separate
+      # step is reaped when that step's shell exits, so start + wait + run must
+      # share a shell. The trap stops the server however this step ends.
+      - name: Run D1 retrieval gate (boot mcpproxy + score)
+        working-directory: ${{ github.workspace }}
+        env:
+          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
+        run: |
+          set -euo pipefail  # -e spelled out so a failing scorer fails the step under any shell setting
+          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
+          # data_dir must exist — `serve` refuses to create a missing one.
+          mkdir -p "$RUNNER_TEMP/eval"
+          ./mcpproxy serve \
+            --config "$DS/snapshot-servers.config.json" \
+            --data-dir "$RUNNER_TEMP/eval" \
+            --listen 127.0.0.1:8092 \
+            --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
+          server_pid=$!
+          # Exit hook: always stop the server; on ANY failure (startup or scoring) print
+          # the server log tail, because that log is not part of the uploaded artifact.
+          trap 'rc=$?; kill "$server_pid" 2>/dev/null || true; [ "$rc" -eq 0 ] || { echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true; }' EXIT
+
+          # Wait for the FULL tool catalog before scoring: the retrieval index is
+          # built from the connected servers' tools, and scoring a partially
+          # indexed instance tanks recall (a ≥1-result check fires far too early).
+          # The 7 reference servers expose ~45 tools; require near-full plus a short
+          # settle. /api/v1/tools wraps as {"success":true,"data":{"tools":[...]}}.
+          ready=0
+          expected=44
+          for i in $(seq 1 60); do
+            if ! kill -0 "$server_pid" 2>/dev/null; then
+              echo "::error::mcpproxy process exited during startup"
+              break
+            fi
+            t="$(curl -fsS -H "X-API-Key: $key" "$base/api/v1/tools" \
+                 | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null)" || t=0  # request or parse failure counts as 0 tools
+            echo "attempt $i: catalog has $t tool(s)"
+            if [ "$t" -ge "$expected" ]; then
+              ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break
+            fi
+            sleep 5
+          done
+          if [ "$ready" != 1 ]; then
+            echo "::error::mcpproxy catalog did not reach ${expected} tools in time"
+            exit 1  # the exit hook above prints the server log tail
+          fi
+
+          ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
+            --corpus "$DS/corpus_v1.tools.json" \
+            --golden "$DS/retrieval_golden_v1.json" \
+            --baseline "$DS/baseline_v1.json" \
+            --tolerance 0.05 \
+            --runs 1 \
+            --base-url "$base" \
+            --api-key "$key" \
+            --out-dir "$GITHUB_WORKSPACE/reports/retrieval" )
+
+      - name: Upload D1 reports
+        if: always()  # also runs after a failed gate so its reports can be inspected
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: eval-retrieval-d1
+          path: reports/retrieval/  # matches the scorer's --out-dir in the previous step
+          retention-days: 14
+          if-no-files-found: ignore  # a startup failure leaves no reports to upload
diff --git a/scripts/eval-ci-smoke.sh b/scripts/eval-ci-smoke.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+#
+# eval-ci-smoke.sh — Spec 065 / D2 security regression gate (and local smoke).
+#
+# Runs the deterministic D2 half of the Spec-065 evaluation end to end:
+#   1. provenance/license guard over the security corpus (FR-007 / CN-005),
+#   2. scan-eval (this repo) N times -> per-run verdict JSON (FR-010 averaging),
+#   3. mcp-eval SecurityScorer gate (P/R/F1/FPR per detector, absolute thresholds).
+#
+# It is the single source of truth shared by `.github/workflows/eval.yml` (Job A)
+# and local pre-flight, so the gate logic is proven the same way in both places.
+#
+# Exit non-zero if the corpus fails the provenance guard, scan-eval fails, or the
+# SecurityScorer gate fails (FPR above ceiling / recall below floor).
+#
+# Config via env (all have CI-friendly defaults; paths may be relative to repo root):
+#   DATASETS_DIR   dataset directory (default: specs/065-evaluation-foundation/datasets)
+#   MCP_EVAL_DIR   checkout of smart-mcp-proxy/mcp-eval. If unset/missing, the
+#                  SecurityScorer step is SKIPPED (Go-only smoke) with a warning.
+#   OUT_DIR        report output dir (default: reports/security) — never committed (CN-003)
+#   RUNS           number of scan-eval runs to average (default: 3)
+#   FPR_CEILING    max allowed per-detector false-positive rate (default: 0.10)  # matches MCP-815
+#   RECALL_FLOOR   min allowed detector recall (default: 0.05)                   # matches MCP-815
+#
+# Threshold provenance (critical): the SecurityScorer *defaults* are recall-floor
+# 0.80, but the production `sensitive-data` detector measures recall ~= 0.10 on
+# this corpus (most malicious entries are prompt-injection / tool-poisoning /
+# rug-pull — out of scope for a secret/path detector). The gate therefore uses
+# the MCP-815 thresholds below, not the defaults. Once MCP-815 lands a
+# `security.gate` block in baseline_v1.json, source these from there so the gate
+# and the baseline never drift.
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # parent of this script's directory
+cd "$REPO_ROOT"  # relative defaults below resolve against the repo root
+
+DATASETS_DIR="${DATASETS_DIR:-specs/065-evaluation-foundation/datasets}"
+OUT_DIR="${OUT_DIR:-reports/security}"
+RUNS="${RUNS:-3}"
+FPR_CEILING="${FPR_CEILING:-0.10}"   # matches MCP-815
+RECALL_FLOOR="${RECALL_FLOOR:-0.05}" # matches MCP-815
+CORPUS="${DATASETS_DIR}/security_corpus_v1.json"
+ALLOWED_LICENSES="MIT Apache-2.0 BSD-3-Clause CC0-1.0 self-authored"  # space-separated license allowlist
+
+log() { printf '\n\033[1;34m==>\033[0m %s\n' "$*"; }
+# Fail fast: with RUNS=0 or a non-number the scan loop would run zero times yet the Go-only smoke still exits 0.
+[ -f "$CORPUS" ] || { echo "error: corpus not found: $CORPUS" >&2; exit 4; }
+[[ "$RUNS" =~ ^0*[1-9][0-9]*$ ]] || { echo "error: RUNS must be a positive integer, got: '$RUNS'" >&2; exit 2; }
+
+log "Provenance / license guard (FR-007 / CN-005) over $CORPUS"
+ALLOWED_LICENSES="$ALLOWED_LICENSES" python3 - "$CORPUS" <<'PY'
+# Each entry needs a category, a provenance.source and an allowlisted provenance.license.
+import json, os, sys
+corpus = json.load(open(sys.argv[1]))
+entries = corpus["entries"] if isinstance(corpus, dict) else corpus
+allowed = set(os.environ["ALLOWED_LICENSES"].split())
+problems = []
+for entry in entries:
+    entry_id = entry.get("id", "?")
+    provenance = entry.get("provenance") or {}
+    if not entry.get("category"):
+        problems.append(f'{entry_id}: missing category')
+    if not provenance.get("source"):
+        problems.append(f'{entry_id}: missing provenance.source')
+    if provenance.get("license") not in allowed:
+        problems.append(f'{entry_id}: license {provenance.get("license")!r} not in allowlist {sorted(allowed)}')
+if problems:
+    print("PROVENANCE GUARD FAILED:", file=sys.stderr)
+    for problem in problems:
+        print("  -", problem, file=sys.stderr)
+    sys.exit(5)
+print(f"OK: {len(entries)} entries, all carry category + allowlisted provenance.license")
+PY
+
+SCRATCH="$(mktemp -d)"  # throwaway directory for the per-run verdict files
+trap 'rm -rf "$SCRATCH"' EXIT
+VERDICT_ARGS=()  # accumulates one "--verdicts <file>" pair per run
+log "Running scan-eval x${RUNS} (deterministic detector; N for FR-010 averaging contract)"
+for run_no in $(seq 1 "$RUNS"); do
+  verdict_path="${SCRATCH}/verdicts_${run_no}.json"
+  go run ./cmd/scan-eval --corpus "$CORPUS" --out "$verdict_path"
+  VERDICT_ARGS+=(--verdicts "$verdict_path")
+done
+echo "Produced ${RUNS} verdict file(s)."
+
+if [ -z "${MCP_EVAL_DIR:-}" ] || [ ! -d "${MCP_EVAL_DIR:-/nonexistent}" ]; then
+  echo "::warning::MCP_EVAL_DIR unset or missing — skipping the SecurityScorer gate (Go-only smoke). Set MCP_EVAL_DIR to a smart-mcp-proxy/mcp-eval checkout to run the full gate."
+  exit 0
+fi
+
+mkdir -p "$OUT_DIR"
+ABS_CORPUS="$(cd "$(dirname "$CORPUS")" && pwd)/$(basename "$CORPUS")"
+ABS_OUT="$(cd "$OUT_DIR" && pwd)"
+log "SecurityScorer gate: fpr-ceiling=${FPR_CEILING} recall-floor=${RECALL_FLOOR} (MCP-815 thresholds)"
+# mcp-eval is run as a module with PYTHONPATH=src (its console-script entry point
+# is not installed by `uv sync`); uv supplies the synced 3.11 interpreter.
+( cd "$MCP_EVAL_DIR" && PYTHONPATH=src uv run python -m mcp_eval.cli security \
+    "${VERDICT_ARGS[@]}" \
+    --corpus "$ABS_CORPUS" \
+    --fpr-ceiling "$FPR_CEILING" \
+    --recall-floor "$RECALL_FLOOR" \
+    --out-dir "$ABS_OUT" )
+
+log "D2 security gate PASSED — reports in $OUT_DIR"
diff --git a/specs/065-evaluation-foundation/datasets/README.md b/specs/065-evaluation-foundation/datasets/README.md
@@ -107,3 +107,36 @@ text from them is vendored into this repo:
 - **`mcp-injection-experiments`** — LICENSE unconfirmed (research.md R-A); where it
   inspired a pattern, the corresponding entry was rewritten from scratch and
   labeled `self-authored`. The corpus test rejects any entry sourced from these.
+
+## CI regression gate (Spec 065 / C1)
+
+`.github/workflows/eval.yml` runs both evaluations as a regression gate
+(FR-009). Two independent jobs keep a network flake in D1 from masking the
+deterministic D2 gate:
+
+- **`security-d2` (D2, blocking)** — Go + Python only, no live upstreams.
+  Provenance/license guard over `security_corpus_v1.json` (FR-007 / CN-005),
+  then `cmd/scan-eval` ×3 → the mcp-eval `SecurityScorer`. Gate thresholds are
+  **`--fpr-ceiling 0.10 --recall-floor 0.05`** (not the scorer defaults: the
+  `sensitive-data` detector measures recall ≈ 0.10 here because most malicious
+  entries are prompt-injection / tool-poisoning / rug-pull, out of scope for a
+  secret/path detector). These thresholds will move to a `security.gate` block
+  in `baseline_v1.json` once MCP-815 lands, so the gate and baseline never drift.
+- **`retrieval-d1` (D1)** — boots `mcpproxy serve` over
+  `snapshot-servers.config.json` (7 reference servers), waits for index
+  readiness, then runs the mcp-eval `RetrievalScorer` with
+  `--baseline baseline_v1.json --tolerance 0.05`. **Report-only on PRs**
+  (npx/uvx fetches are a known flake source), **blocking on the nightly
+  schedule**; promote to PR-blocking after a green soak.
+
+Both jobs upload HTML/JSON reports as run artifacts and the build **never
+commits** them (CN-003). The shared D2 logic lives in `scripts/eval-ci-smoke.sh`
+so the gate runs identically in CI and locally:
+
+```bash
+# Local D2 smoke (full gate needs an mcp-eval checkout):
+MCP_EVAL_DIR=/path/to/mcp-eval bash scripts/eval-ci-smoke.sh
+```
+
+mcp-eval (`smart-mcp-proxy/mcp-eval`, public) is checked out at a pinned ref for
+reproducibility.