# .github/workflows/eval.yml
# Eval Quality
#
# On pull requests that touch the paths listed below (or on manual dispatch),
# this workflow builds diffscope twice -- once from the checked-out branch and
# once from origin/main -- runs `diffscope eval` with each binary, and passes
# the origin/main report to the current-branch run as its `--baseline`.
# Every real step is gated on the OPENAI_API_KEY secret being configured;
# when it is not, the job only prints a skip message.
name: Eval Quality

on:
  pull_request:
    paths:
      - 'src/**'
      - 'eval/**'
      - '.github/workflows/eval.yml'
      - 'Cargo.toml'
      - 'Cargo.lock'
  workflow_dispatch:

permissions:
  contents: read

# A run performs two release builds and two eval passes that are handed
# OPENAI_API_KEY, so a newer run for the same ref cancels the superseded one
# instead of letting it run to completion.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  CARGO_TERM_COLOR: always

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      # Secrets cannot be referenced directly in `if:` conditionals, so the
      # secret is mapped into the step environment and its presence is
      # published as the `configured` step output that later steps test.
      - name: Check eval secret
        id: secret-check
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          if [ -n "${OPENAI_API_KEY}" ]; then
            echo "configured=true" >> "$GITHUB_OUTPUT"
          else
            echo "configured=false" >> "$GITHUB_OUTPUT"
          fi

      # fetch-depth: 0 requests full history rather than a shallow clone.
      - uses: actions/checkout@v5
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        with:
          fetch-depth: 0

      - uses: dtolnay/rust-toolchain@1.88.0
        if: ${{ steps.secret-check.outputs.configured == 'true' }}

      - uses: Swatinem/rust-cache@v2
        if: ${{ steps.secret-check.outputs.configured == 'true' }}

      - name: Build current branch binary
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        run: cargo build --release

      # origin/main is checked out into a separate worktree under /tmp and
      # built there, so the workspace build above is left untouched. Because
      # of the `cd`, the relative `eval/fixtures` path resolves inside that
      # worktree; the report is written to an absolute path for later steps.
      - name: Build baseline report from origin/main
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          git fetch origin main --depth=1
          git worktree add /tmp/diffscope-main origin/main
          cd /tmp/diffscope-main
          cargo build --release
          ./target/release/diffscope eval \
            --model gpt-4o-mini \
            --temperature 0 \
            --fixtures eval/fixtures \
            --output /tmp/eval-baseline.json

      # Runs the workspace binary with the same model/temperature/fixtures
      # arguments as the baseline step and hands it the baseline report.
      # NOTE(review): the meaning of the threshold flags is defined by
      # `diffscope eval`, not by this file -- confirm against its CLI help
      # before tuning the numbers.
      - name: Run eval thresholds on current branch
        if: ${{ steps.secret-check.outputs.configured == 'true' }}
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          ./target/release/diffscope eval \
            --model gpt-4o-mini \
            --temperature 0 \
            --fixtures eval/fixtures \
            --output eval-current.json \
            --baseline /tmp/eval-baseline.json \
            --max-micro-f1-drop 0.20 \
            --min-micro-f1 0.20 \
            --min-verification-health 0.80 \
            --min-rule-f1 sec.shell.injection=0.10 \
            --min-rule-f1 reliability.unwrap_panic=0.10 \
            --max-rule-f1-drop sec.shell.injection=0.25 \
            --max-rule-f1-drop reliability.unwrap_panic=0.25

      # always() keeps this step running after an earlier step has failed, so
      # whatever reports were produced are still uploaded.
      - name: Upload eval reports
        if: ${{ always() && steps.secret-check.outputs.configured == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: eval-reports
          path: |
            eval-current.json
            /tmp/eval-baseline.json

      # Only step that runs when the secret is absent.
      - name: Skip message
        if: ${{ steps.secret-check.outputs.configured != 'true' }}
        run: echo "Skipping eval workflow because OPENAI_API_KEY secret is not configured."