aicodingstack
diff --git a/‎.github/workflows/update-benchmarks.yml‎
Lines changed: 88 additions & 0 deletions b/‎.github/workflows/update-benchmarks.yml‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎manifests/$schemas/model.schema.json‎
Lines changed: 35 additions & 0 deletions b/‎manifests/$schemas/model.schema.json‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎package-lock.json‎
Lines changed: 51 additions & 0 deletions b/‎package-lock.json‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎package.json‎
Lines changed: 2 additions & 0 deletions b/‎package.json‎
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,88 @@
+# Weekly refresh of model benchmark scores; can also be started by hand.
+name: Update Benchmarks
+
+on:
+  # Allow manual triggering
+  workflow_dispatch:
+  # Scheduled run: every Monday at 08:00 UTC
+  schedule:
+    - cron: '0 8 * * 1'
+
+# Write scopes for the token used by the "Create pull request" step below
+# (it commits to a branch and opens a PR).
+permissions:
+  pull-requests: write
+  contents: write
+
+jobs:
+  # Fetches fresh benchmark scores and, when anything under manifests/models/
+  # changed, proposes the result as a pull request.
+  update-benchmarks:
+    name: Update Benchmark Scores
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: '22'
+          cache: 'npm'
+
+      - name: Install dependencies
+        run: npm ci
+
+      # --with-deps also installs the OS libraries Chromium needs, so this
+      # step does not rely on what the runner image happens to ship.
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps chromium
+
+      # A failure here fails the job (default behaviour), so no pull request
+      # is opened when the fetch does not complete.
+      - name: Fetch benchmark data
+        run: npm run fetch:benchmarks
+        timeout-minutes: 30
+
+      # `git status --porcelain` lists modified AND untracked files, so newly
+      # created manifests are detected too. Capturing the output in a plain
+      # assignment lets a git error fail the step instead of being mistaken
+      # for "changed".
+      - name: Check for changes
+        id: git-check
+        run: |
+          status="$(git status --porcelain -- manifests/models/)"
+          if [ -n "$status" ]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # NOTE(review): pull requests opened with GITHUB_TOKEN do not trigger
+      # other workflows; switch to a PAT or GitHub App token if CI must run
+      # on this PR. The repository must also allow Actions to create PRs.
+      - name: Create pull request
+        if: steps.git-check.outputs.changed == 'true'
+        uses: peter-evans/create-pull-request@v7
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          # Commit only the files the change check looked at, so unrelated
+          # workspace modifications never end up in the PR.
+          add-paths: |
+            manifests/models/
+          commit-message: 'chore: update model benchmark scores'
+          title: '📊 Update Model Benchmark Scores'
+          body: |
+            ## Automated Benchmark Data Update
+
+            This PR updates model benchmark scores from the following sources:
+            - [SWE-bench](https://www.swebench.com)
+            - [TerminalBench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0)
+            - [MMMU](https://mmmu-benchmark.github.io/#leaderboard)
+            - [MMMU Pro](https://mmmu-benchmark.github.io/#leaderboard)
+            - [WebDevArena](https://web.lmarena.ai/leaderboard)
+            - [SciCode](https://scicode-bench.github.io/leaderboard/)
+            - [LiveCodeBench](https://livecodebench.github.io/leaderboard.html)
+
+            ### Review Checklist
+            - [ ] Verify scores look reasonable (no negative values, within expected ranges)
+            - [ ] Check that new models are properly identified
+            - [ ] No unintended changes to other manifest fields
+            - [ ] Review workflow logs for any fetch failures or warnings
+
+            ---
+            *This PR was automatically created by the weekly benchmark update workflow.*
+          branch: automated/update-benchmarks
+          delete-branch: true
+          labels: |
+            automated
+            metadata
+            benchmarks
+
+
+  # Writes the outcome of the update job to the run's step summary. Runs even
+  # when the update job failed or was cancelled (`if: always()`).
+  workflow-summary:
+    name: Workflow Summary
+    runs-on: ubuntu-latest
+    needs: [update-benchmarks]
+    if: always()
+    # This job only writes to the step summary, so drop the write scopes it
+    # would otherwise inherit from the workflow-level permissions.
+    permissions: {}
+    steps:
+      - name: Summary
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script text, so the shell never parses expression output as code.
+          UPDATE_RESULT: ${{ needs.update-benchmarks.result }}
+        run: |
+          {
+            echo "## Benchmark Update Summary"
+            echo ""
+            echo "- Update Status: ${UPDATE_RESULT}"
+          } >> "$GITHUB_STEP_SUMMARY"
@@ -45,6 +45,41 @@
         "platformUrls": {
           "$ref": "./ref/platform-urls.schema.json",
           "description": "URLs to various third-party platform pages for this model"
+        },
+        "benchmarks": {
+          "type": "object",
+          "description": "Performance scores from coding and multimodal benchmarks",
+          "properties": {
+            "sweBench": {
+              "type": ["number", "null"],
+              "description": "SWE-bench Verified score (percentage, e.g., 74.4)"
+            },
+            "terminalBench": {
+              "type": ["number", "null"],
+              "description": "TerminalBench 2.0 accuracy score (decimal, e.g., 0.604)"
+            },
+            "mmmu": {
+              "type": ["number", "null"],
+              "description": "MMMU benchmark score (percentage)"
+            },
+            "mmmuPro": {
+              "type": ["number", "null"],
+              "description": "MMMU Pro benchmark score (percentage)"
+            },
+            "webDevArena": {
+              "type": ["number", "null"],
+              "description": "WebDevArena score"
+            },
+            "sciCode": {
+              "type": ["number", "null"],
+              "description": "SciCode benchmark score (percentage)"
+            },
+            "liveCodeBench": {
+              "type": ["number", "null"],
+              "description": "LiveCodeBench Pass@1 score (percentage)"
+            }
+          },
+          "additionalProperties": false
         }
       },
       "required": ["size", "totalContext", "maxOutput", "tokenPricing", "platformUrls"]
 
@@ -28,6 +28,7 @@
     "refactor:sort-fields": "node scripts/refactor/index.mjs sort-manifest-fields",
     "fetch": "node scripts/fetch/index.mjs",
     "fetch:github-stars": "node scripts/fetch/index.mjs github-stars",
+    "fetch:benchmarks": "node scripts/fetch/index.mjs benchmarks",
     "deploy": "npm run build:next && opennextjs-cloudflare build --skipBuild && opennextjs-cloudflare deploy",
     "preview": "npm run build:next && opennextjs-cloudflare build --skipBuild && opennextjs-cloudflare preview",
     "cf-typegen": "wrangler types --env-interface CloudflareEnv ./cloudflare-env.d.ts",
@@ -67,6 +68,7 @@
     "gray-matter": "^4.0.3",
     "husky": "9.1.7",
     "lint-staged": "16.2.7",
+    "playwright": "^1.48.0",
     "rehype-highlight": "^7.0.2",
     "remark-frontmatter": "^5.0.0",
     "remark-gfm": "^4.0.1",