From b8366e604d3f75ef0e86666d80d28944632db6ec Mon Sep 17 00:00:00 2001 From: Mladen Todorovic Date: Tue, 31 Mar 2026 17:00:58 +0200 Subject: [PATCH] Add evaluation docs and script to update it --- docs/model-evaluation.md | 133 +++++++++++++++++++++++ scripts/update-model-evaluation.sh | 163 +++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) create mode 100644 docs/model-evaluation.md create mode 100755 scripts/update-model-evaluation.sh diff --git a/docs/model-evaluation.md b/docs/model-evaluation.md new file mode 100644 index 0000000..756aea0 --- /dev/null +++ b/docs/model-evaluation.md @@ -0,0 +1,133 @@ +# LLM Model Evaluation Results + +## Overview + +This document tracks evaluation results of LLM models used with the StackRox MCP server. Evaluations measure how well a model selects the correct MCP tools, passes appropriate parameters, stays within expected tool call bounds, and produces accurate responses. + +All evaluations use the [mcpchecker](https://github.com/mcpchecker/mcpchecker) framework against a deterministic WireMock-based mock backend, ensuring reproducible results across runs. + +## Evaluation Methodology + +### Test Framework + +Evaluations are run using **mcpchecker**, configured in [`e2e-tests/mcpchecker/eval.yaml`](../e2e-tests/mcpchecker/eval.yaml). The framework: + +1. Sends a natural language prompt to the model under test +2. The model interacts with the MCP server (tool calls, parameter selection) +3. Assertions validate tool usage against expected behavior +4. 
An LLM judge evaluates response quality against reference answers
+
+### Test Environment
+
+- **Backend**: WireMock mock server with deterministic fixtures (no live StackRox Central required)
+- **MCP Config**: [`e2e-tests/mcpchecker/mcp-config-mock.yaml`](../e2e-tests/mcpchecker/mcp-config-mock.yaml)
+- **Task definitions**: [`e2e-tests/mcpchecker/tasks/`](../e2e-tests/mcpchecker/tasks/)
+
+### Assertions
+
+Each task defines assertions from the following set:
+
+| Assertion | Description |
+|-----------|-------------|
+| `toolsUsed` | Required tool(s) must be called, optionally with matching arguments (`argumentsMatch`) |
+| `minToolCalls` | Minimum total tool calls across all tools |
+| `maxToolCalls` | Maximum total tool calls (prevents runaway tool usage) |
+
+A task passes when **all** its assertions pass **and** the LLM judge approves the response.
+
+## Evaluation Results
+
+<!-- evaluation-results:gpt-5-mini:start -->
+
+### gpt-5-mini — 2026-03-31
+
+**Overall: 10/11 tasks passed (90%)**
+
+#### Task Results
+
+| # | Task | Result | toolsUsed | minCalls | maxCalls | Input Tokens | Output Tokens |
+|---|------|--------|-----------|----------|----------|--------------|---------------|
+| 1 | list-clusters | Pass | Pass | Pass | Pass | 1728 | 962 |
+| 2 | cve-detected-workloads | Pass | Pass | Pass | Pass | 565 | 1187 |
+| 3 | cve-detected-clusters | Pass | **Fail** | Pass | Pass | 640 | 1998 |
+| 4 | cve-nonexistent | Pass | Pass | Pass | Pass | 1077 | 2605 |
+| 5 | cve-cluster-does-exist | **Fail** | Pass | Pass | Pass | 539 | 1285 |
+| 6 | cve-cluster-does-not-exist | Pass | **Fail** | Pass | Pass | 1528 | 1324 |
+| 7 | cve-clusters-general | Pass | Pass | Pass | Pass | 796 | 2304 |
+| 8 | cve-cluster-list | Pass | Pass | Pass | Pass | 488 | 1917 |
+| 9 | cve-log4shell | Pass | Pass | Pass | Pass | 1008 | 2936 |
+| 10 | cve-multiple | Pass | Pass | Pass | Pass | 1142 | 2493 |
+| 11 | rhsa-not-supported | Pass | — | Pass | Pass | 650 | 2488 |
+
+**Total input tokens**: 10161 | **Total output 
tokens**: 21499
+
+<!-- evaluation-results:gpt-5-mini:end -->
+
+<!-- evaluation-results:gpt-5:start -->
+
+### gpt-5 — 2026-03-31
+
+**Overall: 9/11 tasks passed (81%)**
+
+#### Task Results
+
+| # | Task | Result | toolsUsed | minCalls | maxCalls | Input Tokens | Output Tokens |
+|---|------|--------|-----------|----------|----------|--------------|---------------|
+| 1 | list-clusters | Pass | Pass | Pass | Pass | 1720 | 552 |
+| 2 | cve-detected-workloads | Pass | Pass | Pass | Pass | 1589 | 1003 |
+| 3 | cve-detected-clusters | Pass | Pass | Pass | Pass | 521 | 1702 |
+| 4 | cve-nonexistent | **Fail** | Pass | Pass | Pass | 2406 | 2085 |
+| 5 | cve-cluster-does-exist | Pass | Pass | Pass | Pass | 1563 | 1682 |
+| 6 | cve-cluster-does-not-exist | **Fail** | **Fail** | Pass | Pass | 504 | 1868 |
+| 7 | cve-clusters-general | Pass | Pass | Pass | Pass | 516 | 1477 |
+| 8 | cve-cluster-list | Pass | Pass | Pass | Pass | 706 | 1964 |
+| 9 | cve-log4shell | Pass | Pass | Pass | Pass | 1008 | 2304 |
+| 10 | cve-multiple | Pass | Pass | Pass | Pass | 2166 | 2492 |
+| 11 | rhsa-not-supported | Pass | — | Pass | Pass | 818 | 2187 |
+
+**Total input tokens**: 13517 | **Total output tokens**: 19316
+
+<!-- evaluation-results:gpt-5:end -->
+
+## How to Run Evaluations
+
+### Prerequisites
+
+- Go 1.25+
+- LLM judge credentials configured via environment variables (see below)
+
+### Running an Evaluation
+
+1. **Configure the agent model** via environment variable or in `e2e-tests/mcpchecker/eval.yaml`:
+
+   ```bash
+   export MODEL_NAME=gpt-5-nano
+   ```
+
+2. **Set judge environment variables**:
+
+   ```bash
+   export JUDGE_TYPE=openai
+   export JUDGE_API_KEY=<api-key>
+   export JUDGE_MODEL_NAME=<judge-model-name>
+   ```
+
+3. **Run the evaluation**:
+
+   ```bash
+   make e2e-test
+   ```
+
+4. **Update this document** with the results:
+
+   ```bash
+   ./scripts/update-model-evaluation.sh \
+     --model-id <model-id> \
+     --results e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
+   ```
+
+   The script generates a markdown section with the task results table and
+   inserts or updates it in this document using HTML comment markers.
+ + If results for the given `--model-id` already exist, the script replaces + the existing section. Otherwise, it appends a new section. diff --git a/scripts/update-model-evaluation.sh b/scripts/update-model-evaluation.sh new file mode 100755 index 0000000..fa6e032 --- /dev/null +++ b/scripts/update-model-evaluation.sh @@ -0,0 +1,163 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(dirname "${SCRIPT_DIR}")" +DOC_FILE="${ROOT_DIR}/docs/model-evaluation.md" + +# Validate required tools first +if ! command -v jq &> /dev/null; then + echo "Error: jq is required but not installed" + exit 1 +fi + +usage() { + echo "Usage: $0 --model-id --results " + echo "" + echo "Update docs/model-evaluation.md with evaluation results from mcpchecker JSON output." + echo "" + echo "Options:" + echo " --model-id Model identifier (e.g. gpt-5-mini)" + echo " --results Path to mcpchecker JSON results file" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 --model-id gpt-5 --results e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json" + exit 1 +} + +MODEL_ID="" +RESULTS_FILE="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --model-id) + MODEL_ID="$2" + shift 2 + ;; + --results) + RESULTS_FILE="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Error: unknown option '$1'" + usage + ;; + esac +done + +if [[ -z "${MODEL_ID}" ]]; then + echo "Error: --model-id is required" + usage +fi + +if [[ -z "${RESULTS_FILE}" ]]; then + echo "Error: --results is required" + usage +fi + +if [[ ! -f "${RESULTS_FILE}" ]]; then + echo "Error: results file not found: ${RESULTS_FILE}" + exit 1 +fi + +if [[ ! 
-f "${DOC_FILE}" ]]; then + echo "Error: documentation file not found: ${DOC_FILE}" + exit 1 +fi + +TODAY=$(date +%Y-%m-%d) +START_MARKER="" +END_MARKER="" + +# Generate the markdown block +generate_block() { + local total passed + total=$(jq 'length' "${RESULTS_FILE}") + passed=$(jq '[.[] | select(.taskPassed == true)] | length' "${RESULTS_FILE}") + local pct=$((100 * passed / total)) + + echo "${START_MARKER}" + echo "" + echo "### ${MODEL_ID} — ${TODAY}" + echo "" + echo "**Overall: ${passed}/${total} tasks passed (${pct}%)**" + echo "" + echo "#### Task Results" + echo "" + echo "| # | Task | Result | toolsUsed | minCalls | maxCalls | Input Tokens | Output Tokens |" + echo "|---|------|--------|-----------|----------|----------|--------------|---------------|" + + # Generate table rows + jq -r ' + to_entries[] | + .key as $i | + .value | + ($i + 1) as $num | + .taskName as $name | + (if .taskPassed then "Pass" else "**Fail**" end) as $result | + (.assertionResults.toolsUsed // null) as $tu | + (.assertionResults.minToolCalls // null) as $min | + (.assertionResults.maxToolCalls // null) as $max | + (if $tu == null then "\u2014" + elif $tu.passed then "Pass" + else "**Fail**" + end) as $tuStr | + (if $min == null then "\u2014" + elif $min.passed then "Pass" + else "**Fail**" + end) as $minStr | + (if $max == null then "\u2014" + elif $max.passed then "Pass" + else "**Fail**" + end) as $maxStr | + (.tokenEstimate.inputTokens) as $inputTokens | + (.tokenEstimate.outputTokens) as $outputTokens | + "| \($num) | \($name) | \($result) | \($tuStr) | \($minStr) | \($maxStr) | \($inputTokens) | \($outputTokens) |" + ' "${RESULTS_FILE}" + + echo "" + + # Token totals + local input_tokens output_tokens + input_tokens=$(jq '[.[].tokenEstimate.inputTokens] | add' "${RESULTS_FILE}") + output_tokens=$(jq '[.[].tokenEstimate.outputTokens] | add' "${RESULTS_FILE}") + echo "**Total input tokens**: ${input_tokens} | **Total output tokens**: ${output_tokens}" + echo "" + echo 
"${END_MARKER}" +} + +BLOCKFILE=$(mktemp) +TMPFILE=$(mktemp) +cleanup() { rm -f "${BLOCKFILE}" "${TMPFILE}"; } +trap cleanup EXIT + +# shellcheck disable=SC2311 +generate_block > "${BLOCKFILE}" + +if grep -qF "${START_MARKER}" "${DOC_FILE}"; then + # Update existing block: replace lines between markers (inclusive) with new block + awk -v start="${START_MARKER}" -v end="${END_MARKER}" -v blockfile="${BLOCKFILE}" ' + $0 == start { skip=1; while ((getline line < blockfile) > 0) print line; next } + $0 == end { skip=0; next } + !skip { print } + ' "${DOC_FILE}" > "${TMPFILE}" + mv "${TMPFILE}" "${DOC_FILE}" + + echo "Updated existing results for ${MODEL_ID} in ${DOC_FILE}" +else + # Insert new block before "## How to Run Evaluations" + awk -v blockfile="${BLOCKFILE}" ' + /^## How to Run Evaluations/ { + while ((getline line < blockfile) > 0) print line + print "" + } + { print } + ' "${DOC_FILE}" > "${TMPFILE}" + mv "${TMPFILE}" "${DOC_FILE}" + + echo "Added new results for ${MODEL_ID} to ${DOC_FILE}" +fi