diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 00000000..74727e20
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,205 @@
+name: Eval (Spec 065 regression gate)
+
+# Spec 065 / C1 (FR-009, US3/P2): regression gate over the frozen Spec-065
+# datasets. Two independent jobs so a network flake in D1 (retrieval) never
+# masks or blocks the deterministic D2 (security) gate.
+#
+#   security-d2  — Go + Python only, no live upstreams. HARD gate (blocking).
+#   retrieval-d1 — needs a live mcpproxy serving 7 reference servers; network
+#                  dependent. Report-only on PRs, blocking on the nightly
+#                  schedule (promote to PR-blocking after a green soak — see the
+#                  plan on MCP-742). Reports are uploaded as artifacts, never
+#                  committed (CN-003).
+#
+# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out at a
+# pinned ref — no token needed.
+
+on:
+  pull_request:
+    paths:
+      - "cmd/scan-eval/**"
+      - "internal/security/**"
+      - "specs/065-evaluation-foundation/datasets/**"
+      - "scripts/eval-ci-smoke.sh"
+      - ".github/workflows/eval.yml"
+  workflow_dispatch: {}
+  schedule:
+    # Nightly soak (02:30 UTC) — exercises D1 against live upstreams.
+    - cron: "30 2 * * *"
+
+permissions:
+  contents: read
+
+env:
+  MCP_EVAL_REF: "76df3a47e1480bfde2433b4f19df19312c985963" # SecurityScorer (B3) merge — pin for reproducibility
+  PYTHON_VERSION: "3.11.13"
+
+jobs:
+  security-d2:
+    name: Security regression gate (D2)
+    runs-on: ubuntu-latest
+    # Deterministic and offline, so it finishes in minutes; cap it so a hang
+    # cannot hold a runner for the 360-minute default.
+    timeout-minutes: 20
+    steps:
+      - name: Checkout mcpproxy-go
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Set up Go
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
+        with:
+          go-version: "1.25"
+          cache: true
+
+      - name: Checkout mcp-eval (public, pinned)
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          repository: smart-mcp-proxy/mcp-eval
+          ref: ${{ env.MCP_EVAL_REF }}
+          path: mcp-eval
+
+      - name: Set up uv + Python
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Sync mcp-eval environment
+        working-directory: mcp-eval
+        run: uv sync
+
+      - name: Run D2 security gate
+        env:
+          MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval
+          OUT_DIR: reports/security
+        run: bash scripts/eval-ci-smoke.sh
+
+      - name: Upload D2 reports
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: eval-security-d2
+          path: reports/security/
+          retention-days: 14
+          if-no-files-found: ignore
+
+      - name: Assert reports are not committed (CN-003)
+        if: always()
+        run: |
+          tracked="$(git ls-files reports/ || true)"
+          if [ -n "$tracked" ]; then
+            echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:"
+            echo "$tracked"
+            exit 1
+          fi
+          echo "OK: no eval reports are tracked by git."
+
+  retrieval-d1:
+    name: Retrieval regression gate (D1)
+    runs-on: ubuntu-latest
+    # Live upstreams can hang; cap the job well below the 360-minute default.
+    timeout-minutes: 45
+    # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake
+    # source); blocking on the nightly schedule. Promote to PR-blocking after a
+    # green soak (plan on MCP-742).
+    continue-on-error: ${{ github.event_name == 'pull_request' }}
+    steps:
+      - name: Checkout mcpproxy-go
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Set up Go
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
+        with:
+          go-version: "1.25"
+          cache: true
+
+      - name: Set up Node.js (npx-launched servers)
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "22"
+
+      - name: Checkout mcp-eval (public, pinned)
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          repository: smart-mcp-proxy/mcp-eval
+          ref: ${{ env.MCP_EVAL_REF }}
+          path: mcp-eval
+
+      - name: Set up uv + Python (uvx-launched servers + mcp-eval)
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Sync mcp-eval environment
+        working-directory: mcp-eval
+        run: uv sync
+
+      - name: Build mcpproxy
+        run: go build -o mcpproxy ./cmd/mcpproxy
+
+      # Boot, poll and score in ONE step: a server backgrounded in a separate
+      # step is reaped when that step's shell exits, so start + wait + run must
+      # share a shell. The trap stops the server however this step ends.
+      - name: Run D1 retrieval gate (boot mcpproxy + score)
+        working-directory: ${{ github.workspace }}
+        env:
+          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
+        run: |
+          set -euo pipefail
+          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
+          # data_dir must exist — `serve` refuses to create a missing one.
+          mkdir -p "$RUNNER_TEMP/eval"
+          ./mcpproxy serve \
+            --config "$DS/snapshot-servers.config.json" \
+            --data-dir "$RUNNER_TEMP/eval" \
+            --listen 127.0.0.1:8092 \
+            --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
+          server_pid=$!
+          trap 'kill "$server_pid" 2>/dev/null || true' EXIT
+
+          # Wait for the FULL tool catalog before scoring: the retrieval index is
+          # built from the connected servers' tools, and scoring a partially
+          # indexed instance tanks recall (a ≥1-result check fires far too early).
+          # The 7 reference servers expose ~45 tools; require near-full + a short
+          # settle for the index build. /api/v1/tools wraps as
+          # {"success":true,"data":{"tools":[...]}}.
+          ready=0
+          expected=44
+          for i in $(seq 1 60); do
+            if ! kill -0 "$server_pid" 2>/dev/null; then
+              echo "::error::mcpproxy process exited during startup"
+              break
+            fi
+            # A failed or hung probe, or non-numeric output, counts as 0 tools.
+            t="$(curl -fsS --max-time 10 -H "X-API-Key: $key" "$base/api/v1/tools" \
+              | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null)" || t=0
+            case "$t" in '' | *[!0-9]*) t=0 ;; esac
+            echo "attempt $i: catalog has $t tool(s)"
+            if [ "$t" -ge "$expected" ]; then
+              ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break
+            fi
+            sleep 5
+          done
+          if [ "$ready" != 1 ]; then
+            echo "::error::mcpproxy catalog did not reach ${expected} tools in time"
+            echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true
+            exit 1
+          fi
+
+          ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
+              --corpus "$DS/corpus_v1.tools.json" \
+              --golden "$DS/retrieval_golden_v1.json" \
+              --baseline "$DS/baseline_v1.json" \
+              --tolerance 0.05 \
+              --runs 1 \
+              --base-url "$base" \
+              --api-key "$key" \
+              --out-dir "$GITHUB_WORKSPACE/reports/retrieval" )
+
+      - name: Upload D1 reports
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: eval-retrieval-d1
+          path: reports/retrieval/
+          retention-days: 14
+          if-no-files-found: ignore
diff --git a/scripts/eval-ci-smoke.sh b/scripts/eval-ci-smoke.sh
new file mode 100755
index 00000000..2f625ef0
--- /dev/null
+++ b/scripts/eval-ci-smoke.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+#
+# eval-ci-smoke.sh — Spec 065 / D2 security regression gate (and local smoke).
+#
+# Runs the deterministic D2 half of the Spec-065 evaluation end to end:
+#   1. provenance/license guard over the security corpus (FR-007 / CN-005),
+#   2. scan-eval (this repo) N times -> per-run verdict JSON (FR-010 averaging),
+#   3. mcp-eval SecurityScorer gate (P/R/F1/FPR per detector, absolute thresholds).
+#
+# It is the single source of truth shared by `.github/workflows/eval.yml` (Job A)
+# and local pre-flight, so the gate logic is proven the same way in both places.
+#
+# Exit non-zero if RUNS is invalid, the corpus fails the provenance guard,
+# scan-eval fails, or the SecurityScorer gate fails (FPR above ceiling / recall
+# below floor).
+#
+# Config via env (all have CI-friendly defaults; paths may be relative to repo root):
+#   DATASETS_DIR  dataset directory (default: specs/065-evaluation-foundation/datasets)
+#   MCP_EVAL_DIR  checkout of smart-mcp-proxy/mcp-eval. If unset/missing, the
+#                 SecurityScorer step is SKIPPED (Go-only smoke) with a warning
+#                 when run locally; under GitHub Actions it is a hard failure so
+#                 the blocking gate can never pass without scoring.
+#   OUT_DIR       report output dir (default: reports/security) — never committed (CN-003)
+#   RUNS          number of scan-eval runs to average, positive integer (default: 3)
+#   FPR_CEILING   max allowed per-detector false-positive rate (default: 0.10) # matches MCP-815
+#   RECALL_FLOOR  min allowed detector recall (default: 0.05) # matches MCP-815
+#
+# Threshold provenance (critical): the SecurityScorer *defaults* are recall-floor
+# 0.80, but the production `sensitive-data` detector measures recall ~= 0.10 on
+# this corpus (most malicious entries are prompt-injection / tool-poisoning /
+# rug-pull — out of scope for a secret/path detector). The gate therefore uses
+# the MCP-815 thresholds below, not the defaults. Once MCP-815 lands a
+# `security.gate` block in baseline_v1.json, source these from there so the gate
+# and the baseline never drift.
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$REPO_ROOT"
+
+DATASETS_DIR="${DATASETS_DIR:-specs/065-evaluation-foundation/datasets}"
+OUT_DIR="${OUT_DIR:-reports/security}"
+RUNS="${RUNS:-3}"
+FPR_CEILING="${FPR_CEILING:-0.10}" # matches MCP-815
+RECALL_FLOOR="${RECALL_FLOOR:-0.05}" # matches MCP-815
+CORPUS="${DATASETS_DIR}/security_corpus_v1.json"
+
+ALLOWED_LICENSES="MIT Apache-2.0 BSD-3-Clause CC0-1.0 self-authored"
+
+log() { printf '\n\033[1;34m==>\033[0m %s\n' "$*"; }
+
+# RUNS must be a positive integer: otherwise `seq` yields nothing and the
+# scorer would be handed zero verdict files.
+if ! [[ "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
+  echo "error: RUNS must be a positive integer, got: '$RUNS'" >&2
+  exit 2
+fi
+
+[ -f "$CORPUS" ] || { echo "error: corpus not found: $CORPUS" >&2; exit 4; }
+
+log "Provenance / license guard (FR-007 / CN-005) over $CORPUS"
+ALLOWED_LICENSES="$ALLOWED_LICENSES" python3 - "$CORPUS" <<'PY'
+import json, os, sys
+with open(sys.argv[1], encoding="utf-8") as fh:
+    corpus = json.load(fh)
+entries = corpus["entries"] if isinstance(corpus, dict) else corpus
+allowed = set(os.environ["ALLOWED_LICENSES"].split())
+bad = []
+for e in entries:
+    prov = e.get("provenance") or {}
+    lic = prov.get("license")
+    if not e.get("category"):
+        bad.append(f'{e.get("id","?")}: missing category')
+    if not prov.get("source"):
+        bad.append(f'{e.get("id","?")}: missing provenance.source')
+    if lic not in allowed:
+        bad.append(f'{e.get("id","?")}: license {lic!r} not in allowlist {sorted(allowed)}')
+if bad:
+    print("PROVENANCE GUARD FAILED:", file=sys.stderr)
+    for b in bad:
+        print(" -", b, file=sys.stderr)
+    sys.exit(5)
+print(f"OK: {len(entries)} entries, all carry category + allowlisted provenance.license")
+PY
+
+WORK="$(mktemp -d)"
+trap 'rm -rf "$WORK"' EXIT
+VERDICT_ARGS=()
+log "Running scan-eval x${RUNS} (deterministic detector; N for FR-010 averaging contract)"
+for i in $(seq 1 "$RUNS"); do
+  vf="${WORK}/verdicts_${i}.json"
+  go run ./cmd/scan-eval --corpus "$CORPUS" --out "$vf"
+  VERDICT_ARGS+=(--verdicts "$vf")
+done
+echo "Produced ${RUNS} verdict file(s)."
+
+if [ -z "${MCP_EVAL_DIR:-}" ] || [ ! -d "${MCP_EVAL_DIR:-/nonexistent}" ]; then
+  # Fail closed in CI: the workflow treats D2 as a hard gate, so a missing
+  # mcp-eval checkout must not turn into a silent pass there.
+  if [ "${GITHUB_ACTIONS:-}" = "true" ]; then
+    echo "::error::MCP_EVAL_DIR unset or missing — the SecurityScorer gate cannot run; refusing to pass the D2 gate in CI."
+    exit 6
+  fi
+  echo "::warning::MCP_EVAL_DIR unset or missing — skipping the SecurityScorer gate (Go-only smoke). Set MCP_EVAL_DIR to a smart-mcp-proxy/mcp-eval checkout to run the full gate."
+  exit 0
+fi
+
+mkdir -p "$OUT_DIR"
+ABS_CORPUS="$(cd "$(dirname "$CORPUS")" && pwd)/$(basename "$CORPUS")"
+ABS_OUT="$(cd "$OUT_DIR" && pwd)"
+log "SecurityScorer gate: fpr-ceiling=${FPR_CEILING} recall-floor=${RECALL_FLOOR} (MCP-815 thresholds)"
+# mcp-eval is run as a module with PYTHONPATH=src (its console-script entry point
+# is not installed by `uv sync`); uv supplies the synced 3.11 interpreter.
+( cd "$MCP_EVAL_DIR" && PYTHONPATH=src uv run python -m mcp_eval.cli security \
+    "${VERDICT_ARGS[@]}" \
+    --corpus "$ABS_CORPUS" \
+    --fpr-ceiling "$FPR_CEILING" \
+    --recall-floor "$RECALL_FLOOR" \
+    --out-dir "$ABS_OUT" )
+
+log "D2 security gate PASSED — reports in $OUT_DIR"
diff --git a/specs/065-evaluation-foundation/datasets/README.md b/specs/065-evaluation-foundation/datasets/README.md
index 2fc616c1..5b819a71 100644
--- a/specs/065-evaluation-foundation/datasets/README.md
+++ b/specs/065-evaluation-foundation/datasets/README.md
@@ -107,3 +107,36 @@ text from them is vendored into this repo:
 - **`mcp-injection-experiments`** — LICENSE unconfirmed (research.md R-A); where
   it inspired a pattern, the corresponding entry was rewritten from scratch and
   labeled `self-authored`. The corpus test rejects any entry sourced from these.
+
+## CI regression gate (Spec 065 / C1)
+
+`.github/workflows/eval.yml` runs both evaluations as a regression gate
+(FR-009). Two independent jobs keep a network flake in D1 from masking the
+deterministic D2 gate:
+
+- **`security-d2` (D2, blocking)** — Go + Python only, no live upstreams.
+  Provenance/license guard over `security_corpus_v1.json` (FR-007 / CN-005),
+  then `cmd/scan-eval` ×3 → the mcp-eval `SecurityScorer`. Gate thresholds are
+  **`--fpr-ceiling 0.10 --recall-floor 0.05`** (not the scorer defaults: the
+  `sensitive-data` detector measures recall ≈ 0.10 here because most malicious
+  entries are prompt-injection / tool-poisoning / rug-pull, out of scope for a
+  secret/path detector). These thresholds will move to a `security.gate` block
+  in `baseline_v1.json` once MCP-815 lands, so the gate and baseline never drift.
+- **`retrieval-d1` (D1)** — boots `mcpproxy serve` over
+  `snapshot-servers.config.json` (7 reference servers), waits for index
+  readiness, then runs the mcp-eval `RetrievalScorer` with
+  `--baseline baseline_v1.json --tolerance 0.05`. **Report-only on PRs**
+  (npx/uvx fetches are a known flake source), **blocking on the nightly
+  schedule**; promote to PR-blocking after a green soak.
+
+Both jobs upload HTML/JSON reports as run artifacts and the build **never
+commits** them (CN-003). The shared D2 logic lives in `scripts/eval-ci-smoke.sh`
+so the gate runs identically in CI and locally:
+
+```bash
+# Local D2 smoke (full gate needs a mcp-eval checkout):
+MCP_EVAL_DIR=/path/to/mcp-eval bash scripts/eval-ci-smoke.sh
+```
+
+mcp-eval (`smart-mcp-proxy/mcp-eval`, public) is checked out at a pinned ref for
+reproducibility.