diffscope/.github/workflows/eval.yml at main · evalops/diffscope · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Workflow display name.
name: Eval Quality

# Triggers: pull requests that touch any of the listed paths (sources, eval
# assets, this workflow file, Cargo manifests), plus manual runs through
# workflow_dispatch.
on:
  pull_request:
    paths:
      - 'src/**'
      - 'eval/**'
      - '.github/workflows/eval.yml'
      - 'Cargo.toml'
      - 'Cargo.lock'
  workflow_dispatch:

# Workflow token is limited to read-only access to repository contents.
permissions:
  contents: read

# Workflow-level environment, visible to every step.
env:
  CARGO_TERM_COLOR: always

jobs:
  # Single job. Its first step reports whether the API key secret is
  # available; every later step is conditioned on that step's `configured`
  # output, with a final message step covering the "not configured" case.
  eval:
    runs-on: ubuntu-latest
    # Upper bound on the job's run time, in minutes.
    timeout-minutes: 60
    steps:
      # Publish the step output `configured` ("true" or "false") depending on
      # whether OPENAI_API_KEY is non-empty. The secret is routed through an
      # env var because the `secrets` context cannot be referenced directly
      # in `if:` conditionals; later steps gate on this output instead.
      - name: Check eval secret
        id: secret-check
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          configured=false
          if [ -n "${OPENAI_API_KEY}" ]; then
            configured=true
          fi
          echo "configured=${configured}" >> "$GITHUB_OUTPUT"

      # Check out the repository; fetch-depth 0 requests full history rather
      # than a shallow clone. Runs only when the API key is configured.
      - uses: actions/checkout@v5
        if: steps.secret-check.outputs.configured == 'true'
        with:
          fetch-depth: 0

      # Rust setup: toolchain action pinned at ref 1.88.0, followed by the
      # rust-cache action. Both are skipped when the API key is missing.
      - uses: dtolnay/rust-toolchain@1.88.0
        if: steps.secret-check.outputs.configured == 'true'
      - uses: Swatinem/rust-cache@v2
        if: steps.secret-check.outputs.configured == 'true'

      # Release build of the checked-out revision. The threshold step later
      # invokes the resulting ./target/release/diffscope binary.
      - name: Build current branch binary
        if: steps.secret-check.outputs.configured == 'true'
        run: cargo build --release

      # Build origin/main in a separate git worktree under /tmp and run its
      # `diffscope eval`, writing the baseline report to
      # /tmp/eval-baseline.json for the threshold step to compare against.
      # NOTE(review): the relative --fixtures path is resolved after `cd`
      # into the origin/main worktree, so the baseline is scored on main's
      # fixtures rather than the PR's - confirm that is intended.
      - name: Build baseline report from origin/main
        if: steps.secret-check.outputs.configured == 'true'
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          git fetch origin main --depth=1
          git worktree add /tmp/diffscope-main origin/main
          cd /tmp/diffscope-main
          cargo build --release
          ./target/release/diffscope eval \
            --model gpt-4o-mini \
            --temperature 0 \
            --fixtures eval/fixtures \
            --output /tmp/eval-baseline.json

      # Run the eval with the binary built from the checked-out revision,
      # using the same model/temperature settings as the baseline run. The
      # report is written to eval-current.json, and the baseline report is
      # passed in together with the min/max threshold flags.
      # NOTE(review): presumably the CLI exits non-zero when a threshold is
      # violated, which would fail this step - confirm against the diffscope
      # CLI documentation.
      - name: Run eval thresholds on current branch
        if: steps.secret-check.outputs.configured == 'true'
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          ./target/release/diffscope eval \
            --model gpt-4o-mini \
            --temperature 0 \
            --fixtures eval/fixtures \
            --output eval-current.json \
            --baseline /tmp/eval-baseline.json \
            --max-micro-f1-drop 0.20 \
            --min-micro-f1 0.20 \
            --min-verification-health 0.80 \
            --min-rule-f1 sec.shell.injection=0.10 \
            --min-rule-f1 reliability.unwrap_panic=0.10 \
            --max-rule-f1-drop sec.shell.injection=0.25 \
            --max-rule-f1-drop reliability.unwrap_panic=0.25

      # Upload both reports as the `eval-reports` artifact. always() lets
      # this step run even after an earlier step has failed, so reports from
      # a failing eval are still collected.
      # NOTE(review): the two paths sit under different roots (workspace and
      # /tmp); upload-artifact roots the archive at their least common
      # ancestor, so the artifact likely contains nested directories -
      # verify the resulting layout.
      - name: Upload eval reports
        if: always() && steps.secret-check.outputs.configured == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: eval-reports
          path: |
            eval-current.json
            /tmp/eval-baseline.json

      # Counterpart to the gated steps: when the secret check did not report
      # `configured=true`, log why the eval was skipped.
      - name: Skip message
        if: steps.secret-check.outputs.configured != 'true'
        run: echo "Skipping eval workflow because OPENAI_API_KEY secret is not configured."