From 8e06d58b9bd6eba4c5f1bd11cef280586d65cf56 Mon Sep 17 00:00:00 2001
From: Mladen Todorovic
Date: Thu, 2 Apr 2026 12:26:01 +0200
Subject: [PATCH] Add weekly job for model evaluation doc update

---
 .github/workflows/model-evaluation.yml | 156 +++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 .github/workflows/model-evaluation.yml

diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml
new file mode 100644
index 0000000..a72565c
--- /dev/null
+++ b/.github/workflows/model-evaluation.yml
@@ -0,0 +1,156 @@
+# Evaluates the selected models with the E2E test suite and opens a pull
+# request when docs/model-evaluation.md changed as a result.
+name: Weekly Model Evaluation
+
+on:
+  schedule:
+    - cron: '0 6 * * 1'  # Every Monday at 6:00 UTC
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Comma-separated list of model IDs to evaluate'
+        required: false
+        default: 'gpt-5-mini'
+
+# Least privilege by default; update-docs requests the write scopes it needs.
+permissions:
+  contents: read
+
+# A newly started run cancels any in-progress run in the same group.
+concurrency:
+  group: model-evaluation
+  cancel-in-progress: true
+
+jobs:
+  # Turns the model list into a job matrix and records the run date.
+  prepare:
+    name: Prepare Model Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      date: ${{ steps.set-date.outputs.date }}
+    steps:
+      - name: Set date
+        id: set-date
+        run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+
+      - name: Set matrix
+        id: set-matrix
+        env:
+          # Handed over as an environment variable instead of being expanded
+          # inside the script, so the value is never parsed as shell code.
+          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
+        run: |
+          # Split on commas and whitespace and drop empty entries, so that
+          # "a, b" yields the same model IDs the update-docs loop sees.
+          MATRIX=$(printf '%s\n' "$MODELS" \
+            | tr -s ',[:space:]' '\n' \
+            | jq -R 'select(length > 0)' \
+            | jq -sc '{"model": .}')
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  # Runs the E2E suite once per model and uploads that model's result file.
+  evaluate:
+    name: Evaluate ${{ matrix.model }}
+    needs: prepare
+    runs-on: ubuntu-latest
+    strategy:
+      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+
+      - name: Setup proto files
+        run: make proto-setup
+
+      - name: Generate proto descriptors
+        run: make proto-generate
+
+      - name: Download WireMock
+        run: make mock-download
+
+      - name: Run E2E tests
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MODEL_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MODEL_NAME: "openai:${{ matrix.model }}"
+        run: make e2e-test
+
+      # Runs regardless of earlier step outcomes; a missing file is an error.
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results-${{ matrix.model }}
+          path: e2e-tests/mcpchecker/mcpchecker-stackrox-mcp-e2e-out.json
+          if-no-files-found: error
+
+  # Passes each model's result file to the docs update script and opens a
+  # pull request when docs/model-evaluation.md changed.
+  update-docs:
+    name: Update Documentation & Create PR
+    needs: [prepare, evaluate]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download all results
+        uses: actions/download-artifact@v8
+        with:
+          pattern: eval-results-*
+          path: eval-results
+
+      - name: Update model evaluation docs
+        env:
+          # Environment variable instead of inline expansion; see "Set matrix".
+          MODELS: ${{ inputs.models || 'gpt-5-mini' }}
+        run: |
+          for MODEL in $(echo "$MODELS" | tr ',' ' '); do
+            RESULTS_FILE="eval-results/eval-results-${MODEL}/mcpchecker-stackrox-mcp-e2e-out.json"
+            if [ -f "$RESULTS_FILE" ]; then
+              echo "Updating docs for model: ${MODEL}"
+              ./scripts/update-model-evaluation.sh \
+                --model-id "${MODEL}" \
+                --results "${RESULTS_FILE}"
+            else
+              echo "::warning:: No results found for model ${MODEL}"
+            fi
+          done
+
+      - name: Check for changes
+        id: check-changes
+        run: |
+          if git diff --quiet docs/model-evaluation.md; then
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Create Pull Request
+        if: steps.check-changes.outputs.changed == 'true'
+        uses: peter-evans/create-pull-request@v7
+        with:
+          branch: chore/update-model-evaluation-${{ needs.prepare.outputs.date }}
+          commit-message: "Update model evaluations ${{ needs.prepare.outputs.date }}"
+          title: "chore(evals): Update model evaluations ${{ needs.prepare.outputs.date }}"
+          # Commit only the docs file; the downloaded eval-results/ directory
+          # would otherwise be committed too unless it is gitignored.
+          add-paths: docs/model-evaluation.md
+          body: |
+            Automated weekly model evaluation update.
+
+            **Models evaluated:** ${{ inputs.models || 'gpt-5-mini' }}
+            **Date:** ${{ needs.prepare.outputs.date }}
+
+            This PR was automatically generated by the [Model Evaluation workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).
+          base: main