Skip to content

Commit 1252b8d

Browse files
ericyangpan and claude committed
feat: implement benchmark infrastructure for AI models
Add comprehensive benchmark tracking system:
- Add benchmarks field to model schema supporting 7 benchmark types
- Create benchmark utility library for formatting and validation
- Implement automated benchmark fetcher using Playwright
  - Scrapes SWE-bench, TerminalBench, MMMU, SciCode, LiveCodeBench
  - Smart model name matching with fallback strategies
  - Parallel fetching with retry logic
- Add weekly CI workflow for automated benchmark updates
- Install playwright dependency for web scraping
- Add fetch:benchmarks npm script

Supported benchmarks:
- SWE-bench Verified: Real-world software engineering tasks
- TerminalBench 2.0: Command-line interface proficiency
- MMMU/MMMU Pro: Multimodal understanding
- WebDevArena: Web development challenges
- SciCode: Scientific code generation
- LiveCodeBench: Live coding performance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent d708a34 commit 1252b8d

File tree

7 files changed

+982
-0
lines changed

7 files changed

+982
-0
lines changed
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
# Weekly workflow that re-fetches model benchmark scores with
# `npm run fetch:benchmarks` and opens a pull request when anything under
# manifests/models/ changed. A second job writes the outcome to the run summary.
name: Update Benchmarks

on:
  schedule:
    # Run every Monday at 08:00 UTC
    - cron: '0 8 * * 1'
  workflow_dispatch:  # Allow manual triggering

# Grant nothing by default; each job opts in to the scopes it needs.
permissions: {}

jobs:
  update-benchmarks:
    name: Update Benchmark Scores
    runs-on: ubuntu-latest
    # Required to push the update branch and to open the pull request.
    permissions:
      contents: write
      pull-requests: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: '22'
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      - name: Install Playwright browsers
        # --with-deps also installs the OS libraries Chromium needs to launch.
        run: npx playwright install --with-deps chromium

      - name: Fetch benchmark data
        run: npm run fetch:benchmarks
        timeout-minutes: 30
        continue-on-error: false

      - name: Check for changes
        id: git-check
        # `git status --porcelain` reports untracked files as well as edits to
        # tracked ones, and does not dump the full diff into the job log.
        run: |
          if [ -n "$(git status --porcelain -- manifests/models/)" ]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Create pull request
        if: steps.git-check.outputs.changed == 'true'
        uses: peter-evans/create-pull-request@v7
        with:
          # NOTE: pull requests opened with GITHUB_TOKEN do not trigger other
          # workflows (e.g. CI checks) on the created branch.
          token: ${{ secrets.GITHUB_TOKEN }}
          # Commit only the directory the change check above inspected.
          add-paths: |
            manifests/models/
          commit-message: 'chore: update model benchmark scores'
          title: '📊 Update Model Benchmark Scores'
          body: |
            ## Automated Benchmark Data Update

            This PR updates model benchmark scores from the following sources:
            - [SWE-bench](https://www.swebench.com)
            - [TerminalBench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0)
            - [MMMU](https://mmmu-benchmark.github.io/#leaderboard)
            - [MMMU Pro](https://mmmu-benchmark.github.io/#leaderboard)
            - [WebDevArena](https://web.lmarena.ai/leaderboard)
            - [SciCode](https://scicode-bench.github.io/leaderboard/)
            - [LiveCodeBench](https://livecodebench.github.io/leaderboard.html)

            ### Review Checklist
            - [ ] Verify scores look reasonable (no negative values, within expected ranges)
            - [ ] Check that new models are properly identified
            - [ ] No unintended changes to other manifest fields
            - [ ] Review workflow logs for any fetch failures or warnings

            ---
            *This PR was automatically created by the weekly benchmark update workflow.*
          branch: automated/update-benchmarks
          delete-branch: true
          labels: |
            automated
            metadata
            benchmarks

  workflow-summary:
    name: Workflow Summary
    runs-on: ubuntu-latest
    needs: [update-benchmarks]
    # Run even when the update job failed so the summary always reports it.
    if: always()
    steps:
      - name: Summary
        env:
          # Pass the expression through the environment instead of splicing it
          # into the shell script text.
          UPDATE_STATUS: ${{ needs.update-benchmarks.result }}
        run: |
          {
            echo "## Benchmark Update Summary"
            echo ""
            echo "- Update Status: ${UPDATE_STATUS}"
          } >> "$GITHUB_STEP_SUMMARY"

manifests/$schemas/model.schema.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,41 @@
4545
"platformUrls": {
4646
"$ref": "./ref/platform-urls.schema.json",
4747
"description": "URLs to various third-party platform pages for this model"
48+
},
49+
"benchmarks": {
50+
"type": "object",
51+
"description": "Performance scores from coding and multimodal benchmarks",
52+
"properties": {
53+
"sweBench": {
54+
"type": ["number", "null"],
55+
"description": "SWE-bench Verified score (percentage, e.g., 74.4)"
56+
},
57+
"terminalBench": {
58+
"type": ["number", "null"],
59+
"description": "TerminalBench 2.0 accuracy score (decimal, e.g., 0.604)"
60+
},
61+
"mmmu": {
62+
"type": ["number", "null"],
63+
"description": "MMMU benchmark score (percentage)"
64+
},
65+
"mmmuPro": {
66+
"type": ["number", "null"],
67+
"description": "MMMU Pro benchmark score (percentage)"
68+
},
69+
"webDevArena": {
70+
"type": ["number", "null"],
71+
"description": "WebDevArena score"
72+
},
73+
"sciCode": {
74+
"type": ["number", "null"],
75+
"description": "SciCode benchmark score (percentage)"
76+
},
77+
"liveCodeBench": {
78+
"type": ["number", "null"],
79+
"description": "LiveCodeBench Pass@1 score (percentage)"
80+
}
81+
},
82+
"additionalProperties": false
4883
}
4984
},
5085
"required": ["size", "totalContext", "maxOutput", "tokenPricing", "platformUrls"]

package-lock.json

Lines changed: 51 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"refactor:sort-fields": "node scripts/refactor/index.mjs sort-manifest-fields",
2929
"fetch": "node scripts/fetch/index.mjs",
3030
"fetch:github-stars": "node scripts/fetch/index.mjs github-stars",
31+
"fetch:benchmarks": "node scripts/fetch/index.mjs benchmarks",
3132
"deploy": "npm run build:next && opennextjs-cloudflare build --skipBuild && opennextjs-cloudflare deploy",
3233
"preview": "npm run build:next && opennextjs-cloudflare build --skipBuild && opennextjs-cloudflare preview",
3334
"cf-typegen": "wrangler types --env-interface CloudflareEnv ./cloudflare-env.d.ts",
@@ -67,6 +68,7 @@
6768
"gray-matter": "^4.0.3",
6869
"husky": "9.1.7",
6970
"lint-staged": "16.2.7",
71+
"playwright": "^1.48.0",
7072
"rehype-highlight": "^7.0.2",
7173
"remark-frontmatter": "^5.0.0",
7274
"remark-gfm": "^4.0.1",

0 commit comments

Comments
 (0)