Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5b7cca0
Initial plan
Copilot Mar 8, 2026
23aecf0
feat: scaffold eval test suite with three skill tasks and CI workflow
Copilot Mar 8, 2026
ac1335a
refactor: improve regex readability in grader scripts per code review
Copilot Mar 8, 2026
f807aa3
fix: add explicit permissions to skill-eval workflow (CodeQL alert)
Copilot Mar 8, 2026
c183089
Merge branch 'master' into copilot/implement-automated-eval-test-suite
kdinev Mar 9, 2026
6e7b838
fix: replace skill-eval package dep with self-contained local runner
Copilot Mar 9, 2026
b2047d8
fix: emit passRate/passAtK in result JSON so CI summary shows actual …
Copilot Mar 10, 2026
1691296
Update evals/tasks/component-combo-reactive-form/solution/solve.sh
kdinev Mar 10, 2026
2df335e
Update .github/workflows/skill-eval.yml
kdinev Mar 10, 2026
b22b13f
fix: tighten grader checks per review feedback
Copilot Mar 10, 2026
c684351
Merge branch 'master' into copilot/implement-automated-eval-test-suite
kdinev Mar 10, 2026
94d4bf8
Update Node.js version in skill-eval workflow
kdinev Mar 10, 2026
18f3e25
Update skill-eval.yml
kdinev Mar 10, 2026
568b04d
Remove eval dependencies installation step
kdinev Mar 10, 2026
5da6711
Merge branch 'master' into copilot/implement-automated-eval-test-suite
kdinev Mar 10, 2026
b181ca0
feat: add copilot-cli and gemini-cli agent modes to eval runner
Copilot Mar 10, 2026
665264b
fix: use read -ra for safe array parsing, add TRIALS guard
Copilot Mar 10, 2026
b3fa973
Update skill-eval.yml
kdinev Mar 10, 2026
1330989
rework CI workflow to always run against both copilot and gemini agents
Copilot Mar 10, 2026
a9da524
add agent prompt files, switch CI to npm scripts, clean up README
Copilot Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions .github/workflows/skill-eval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# Skill Eval workflow: validates the eval graders, runs the evals against the
# Copilot and Gemini agents, and posts a combined summary comment on the PR.
name: Skill Eval

# Triggers: pull requests that touch the skills, the evals, or this workflow
# file itself, plus manual runs from the Actions tab.
on:
  pull_request:
    paths:
      - 'skills/**'
      - 'evals/**'
      # Without this entry a PR that only edits the workflow would never
      # exercise the edited workflow before merge.
      - '.github/workflows/skill-eval.yml'
  workflow_dispatch:

# Default token permissions for every job: read the repository contents and
# write pull-request comments (needed by the summary job).
permissions:
  contents: read
  pull-requests: write

jobs:
# Job 1: check the graders against the reference solutions by running the
# `validate` npm script from the evals/ directory.
validate_graders:
  timeout-minutes: 10
  runs-on: ubuntu-latest

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '22'

    - name: Validate graders against reference solutions
      run: npm run validate
      working-directory: evals

    # `always()` keeps the upload running after a failed validation so the
    # result files can still be inspected and picked up by the summary job.
    - name: Upload validation results
      uses: actions/upload-artifact@v4
      if: always()
      with:
        path: evals/results/
        name: skill-eval-validation-results
        retention-days: 30

# Job 2: Run evals against the Copilot agent
agent_eval_copilot:
  # Skip pull requests opened from forks: those runs get a read-only token and
  # no repository secrets, and the summary job ignores fork PRs as well.
  # Manual (workflow_dispatch) runs are never skipped.
  if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false
  runs-on: ubuntu-latest
  timeout-minutes: 60

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '22'

    - name: Install Copilot CLI
      run: npm install -g @github/copilot

    # NOTE(review): the default GITHUB_TOKEN may not carry Copilot access;
    # confirm the CLI can authenticate with it or switch to a dedicated secret.
    - name: Run eval against Copilot
      working-directory: evals
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: npm run agent:copilot

    # Upload even when the eval step fails so partial results stay available
    # to the summary job.
    - name: Upload Copilot eval results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: skill-eval-agent-copilot-results
        path: evals/results/
        retention-days: 30

# Job 3: Run evals against the Gemini agent
agent_eval_gemini:
  # Skip pull requests opened from forks: repository secrets are not passed to
  # those runs, so GEMINI_API_KEY would be empty and the eval could not
  # authenticate. Manual (workflow_dispatch) runs are never skipped.
  if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false
  runs-on: ubuntu-latest
  timeout-minutes: 60

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: '22'

    - name: Install Gemini CLI
      run: npm install -g @google/gemini-cli

    # The API key is exposed only to this step.
    - name: Run eval against Gemini
      working-directory: evals
      env:
        GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      run: npm run agent:gemini

    # Upload even when the eval step fails so partial results stay available
    # to the summary job.
    - name: Upload Gemini eval results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: skill-eval-agent-gemini-results
        path: evals/results/
        retention-days: 30

# Job 4: Post combined summary comment on PRs
post_summary:
  # Run after the three eval jobs whether they passed or failed, but only for
  # pull requests from this repository (not forks). `!cancelled()` is used
  # instead of `always()` because `always()` is also true for a cancelled run,
  # which would post a summary of partial results.
  if: ${{ !cancelled() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false }}
  needs: [validate_graders, agent_eval_copilot, agent_eval_gemini]
  runs-on: ubuntu-latest
  # Matches the other jobs, which all declare an explicit timeout.
  timeout-minutes: 10

  steps:
    # Each download tolerates a missing artifact (`continue-on-error`) so one
    # failed eval job does not prevent the summary from being posted.
    - name: Download validation results
      uses: actions/download-artifact@v4
      with:
        name: skill-eval-validation-results
        path: evals/results/validation
      continue-on-error: true

    - name: Download Copilot results
      uses: actions/download-artifact@v4
      with:
        name: skill-eval-agent-copilot-results
        path: evals/results/copilot
      continue-on-error: true

    - name: Download Gemini results
      uses: actions/download-artifact@v4
      with:
        name: skill-eval-agent-gemini-results
        path: evals/results/gemini
      continue-on-error: true

    # Builds the Markdown summary from the downloaded JSON files and posts it
    # on the pull request.
    - name: Post summary comment
      uses: actions/github-script@v7
      with:
        script: |
const fs = require('fs');
const path = require('path');

/**
 * Load the eval result files found directly inside `dir`.
 *
 * Every `*.json` file except `baseline.json` is parsed. A file that cannot be
 * read or parsed, or whose top-level JSON value is not an object, is reported
 * as `{ task: <file name without .json>, error: true }` so the caller can
 * still render a row for it.
 *
 * @param {string} dir Directory that holds the result files.
 * @returns {object[]} Results ordered by file name; empty when `dir` is
 *   missing or unreadable.
 */
function readResults(dir) {
  let entries;
  try {
    entries = fs.readdirSync(dir);
  } catch (e) {
    // Missing or unreadable directory (e.g. the artifact was never uploaded):
    // treat it as "no results" rather than failing the summary step.
    return [];
  }

  return entries
    .filter(name => name.endsWith('.json') && name !== 'baseline.json')
    // readdirSync gives no ordering guarantee; sort for a stable table.
    .sort()
    .map(name => {
      // basename() strips only the trailing extension, unlike
      // replace('.json', '') which removes the first occurrence.
      const failed = { task: path.basename(name, '.json'), error: true };
      try {
        const parsed = JSON.parse(fs.readFileSync(path.join(dir, name), 'utf8'));
        // Valid JSON such as `null`, a number or an array is not a result
        // record; returning it would break property access in the caller.
        const isRecord = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
        return isRecord ? parsed : failed;
      } catch (e) {
        return failed;
      }
    });
}

// Hidden HTML marker embedded in the comment body. It lets a later run find
// the comment this workflow posted earlier and update it, instead of adding a
// new comment to the pull request on every run.
const MARKER = '<!-- skill-eval-summary -->';

// Render a 0-1 fraction as a whole-number percentage, or 'N/A' when absent.
const pct = v => (v != null ? `${(v * 100).toFixed(0)}%` : 'N/A');

// Make a value safe inside a Markdown table cell: escape the column
// separator and collapse line breaks, either of which would break the row.
const cell = v => String(v).replace(/\|/g, '\\|').replace(/\s*\r?\n\s*/g, ' ');

let summary = `${MARKER}\n## 📊 Skill Eval Results\n\n`;

// --- Validation results ---
const validation = readResults('evals/results/validation');
if (validation.length > 0) {
  summary += '### Grader Validation (reference solutions)\n\n';
  summary += '| Task | Pass Rate | Status |\n';
  summary += '|---|---|---|\n';
  for (const r of validation) {
    // Fall back to 'unknown' so a result without a task never prints "undefined".
    const taskName = cell(r.task || 'unknown');
    if (r.error) { summary += `| ${taskName} | Error | ❌ |\n`; continue; }
    // Reference solutions must pass every time; anything below 100% fails.
    const status = r.passRate >= 1.0 ? '✅' : '❌';
    summary += `| ${taskName} | ${pct(r.passRate)} | ${status} |\n`;
  }
  summary += '\n';
}

// --- Agent results ---
const copilot = readResults('evals/results/copilot');
const gemini = readResults('evals/results/gemini');

if (copilot.length > 0 || gemini.length > 0) {
  summary += '### Agent Evaluation\n\n';
  summary += '| Task | Agent | Pass Rate | pass@k | Status |\n';
  summary += '|---|---|---|---|---|\n';

  for (const r of [...copilot, ...gemini]) {
    const taskName = cell(r.task || 'unknown');
    if (r.error) { summary += `| ${taskName} | — | Error | Error | ❌ |\n`; continue; }
    const agent = cell(r.agent || 'unknown');
    // Same cut-offs as the "Thresholds" legend below; a missing pass@k
    // compares false against both and is shown as a failure.
    const status = r.passAtK >= 0.8 ? '✅' : r.passAtK >= 0.6 ? '⚠️' : '❌';
    summary += `| ${taskName} | ${agent} | ${pct(r.passRate)} | ${pct(r.passAtK)} | ${status} |\n`;
  }
  summary += '\n';
}

if (validation.length === 0 && copilot.length === 0 && gemini.length === 0) {
  summary += '> ⚠️ No eval results found. The eval runs may have failed.\n';
}

summary += '### Thresholds\n';
summary += '- ✅ `pass@k ≥ 80%` — merge gate passed\n';
summary += '- ⚠️ `pass@k ≥ 60%` — needs investigation\n';
summary += '- ❌ `pass@k < 60%` — blocks merge for affected skill\n';

// Update the summary comment from an earlier run when there is one;
// otherwise create it.
const { owner, repo } = context.repo;
const issue_number = context.issue.number;
const comments = await github.paginate(github.rest.issues.listComments, {
  owner,
  repo,
  issue_number,
  per_page: 100,
});
// Only reuse bot-authored comments, so a user comment that happens to quote
// the marker is never overwritten.
const previous = comments.find(
  c => c.user && c.user.type === 'Bot' && typeof c.body === 'string' && c.body.includes(MARKER)
);

if (previous) {
  await github.rest.issues.updateComment({
    owner,
    repo,
    comment_id: previous.id,
    body: summary,
  });
} else {
  await github.rest.issues.createComment({
    owner,
    repo,
    issue_number,
    body: summary,
  });
}
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,8 @@ extras/docs/themes/sassdoc/sassdoc/*

# Localization sources
i18nRepo

# Eval artifacts: ignore the evals' installed dependencies and the generated
# result JSON files. The negated pattern un-ignores baseline.json so the
# baseline results stay tracked.
evals/node_modules
evals/results/*.json
!evals/results/baseline.json
Loading
Loading