Skip to content

Commit bed5094

Browse files
jahooma and claude committed
Overhaul judge and criteria for E2E testing with CLI agent reviewers
Major changes:

Judge: Replaced CodebuffClient SDK-based LLM judges with real CLI coding agents (Claude Code, Codex, Gemini) that run IN the repo. Reviewer agents can build, run tests, start the dev server, use browser tools, curl endpoints, check logs — actual E2E verification, not just diff reading. Structured output via result file (evalbuff-review-result.json) with fallback to stdout JSON extraction.

Criteria: Shifted from code style (correctness, completeness, pattern consistency, fluency) to E2E verification levels:
- L1: Builds, existing tests pass, basic completeness
- L2: Feature works E2E (browser/curl/client), logs clean
- L3: Edge cases & error states tested E2E, UI verification
- L4: Cross-component integration, performance, no regressions
- L5: Production readiness (migrations, env vars, error recovery)

Orchestrator: Judge now runs inside withTestRepo callback so reviewer agents have access to the live repo. CodebuffClient only used for doc writer (analyzeFailure). Added --reviewers CLI flag.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1a754ce commit bed5094

File tree

7 files changed

+507
-291
lines changed

7 files changed

+507
-291
lines changed

evals/evalbuff/__tests__/criteria.test.ts

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -26,32 +26,34 @@ describe('getCriteriaForLevel', () => {
2626
const criteria = getCriteriaForLevel(1)
2727
expect(criteria).toHaveLength(3)
2828
expect(criteria.map((c) => c.name)).toEqual([
29-
'Correctness',
30-
'Completeness',
31-
'Basic Style',
29+
'Builds & Compiles',
30+
'Existing Tests Pass',
31+
'Basic Completeness',
3232
])
3333
})
3434

3535
it('accumulates criteria up to level 3', () => {
3636
const criteria = getCriteriaForLevel(3)
3737
expect(criteria.map((c) => c.name)).toEqual([
38-
'Correctness',
39-
'Completeness',
40-
'Basic Style',
41-
'Pattern Consistency',
42-
'Test Quality',
38+
'Builds & Compiles',
39+
'Existing Tests Pass',
40+
'Basic Completeness',
41+
'Feature Works E2E',
42+
'Logs & Observability',
43+
'Edge Cases & Error States',
44+
'UI/UX Verification',
4345
])
4446
})
4547

4648
it('includes all criteria at level 5', () => {
4749
const criteria = getCriteriaForLevel(5)
48-
expect(criteria).toHaveLength(7)
49-
expect(criteria[criteria.length - 1].name).toBe('Fluency')
50+
expect(criteria).toHaveLength(10)
51+
expect(criteria[criteria.length - 1].name).toBe('Production Readiness')
5052
})
5153

5254
it('caps at level 5 even if higher number passed', () => {
5355
const criteria = getCriteriaForLevel(10)
54-
expect(criteria).toHaveLength(7)
56+
expect(criteria).toHaveLength(10)
5557
})
5658
})
5759

@@ -86,26 +88,32 @@ describe('maybePromoteCriteria', () => {
8688

8789
it('uses only the last N scores in the window', () => {
8890
const criteria = makeCriteria(2, 8.0, 3)
89-
// Old scores are low, but last 3 are high
9091
const scores = [3.0, 4.0, 5.0, 8.5, 9.0, 8.5]
9192
const newLevel = maybePromoteCriteria(criteria, scores)
9293
expect(newLevel).toBe(3)
9394
})
9495
})
9596

9697
describe('formatCriteriaForPrompt', () => {
97-
it('includes level and all criteria names', () => {
98+
it('includes level and E2E-focused criteria names', () => {
9899
const criteria = makeCriteria(2)
99100
const prompt = formatCriteriaForPrompt(criteria)
100101
expect(prompt).toContain('Level 2/5')
101-
expect(prompt).toContain('Correctness')
102-
expect(prompt).toContain('Pattern Consistency')
102+
expect(prompt).toContain('Builds & Compiles')
103+
expect(prompt).toContain('Feature Works E2E')
103104
})
104105

105106
it('includes weights', () => {
106107
const criteria = makeCriteria(1)
107108
const prompt = formatCriteriaForPrompt(criteria)
108109
expect(prompt).toContain('weight: 3')
109-
expect(prompt).toContain('weight: 1')
110+
expect(prompt).toContain('weight: 2')
111+
})
112+
113+
it('instructs E2E verification', () => {
114+
const criteria = makeCriteria(1)
115+
const prompt = formatCriteriaForPrompt(criteria)
116+
expect(prompt).toContain('MUST verify')
117+
expect(prompt).toContain('E2E testing')
110118
})
111119
})

evals/evalbuff/__tests__/e2e.test.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,8 +55,10 @@ mock.module('../judge', () => ({
5555
analysis: `Mock analysis for call ${judgeCallCount}`,
5656
strengths: ['Correctly identified the problem'],
5757
weaknesses: ['Missing error handling', 'No tests added'],
58+
e2eTestsPerformed: ['Started dev server', 'Tested API endpoint'],
5859
completionScore: score,
5960
codeQualityScore: score,
61+
e2eScore: score,
6062
overallScore: score,
6163
} satisfies JudgingResult
6264
},

evals/evalbuff/__tests__/loop.integration.test.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,8 +56,10 @@ mock.module('../judge', () => ({
5656
analysis: 'Mock analysis',
5757
strengths: ['Good'],
5858
weaknesses: ['Could improve'],
59+
e2eTestsPerformed: ['Mock E2E test'],
5960
completionScore: score,
6061
codeQualityScore: score,
62+
e2eScore: score,
6163
overallScore: score,
6264
} satisfies JudgingResult
6365
},

evals/evalbuff/criteria.ts

Lines changed: 40 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -16,54 +16,72 @@ export interface QualityCriteria {
1616
export const DEFAULT_CRITERIA: Record<number, QualityCriterion[]> = {
1717
1: [
1818
{
19-
name: 'Correctness',
19+
name: 'Builds & Compiles',
2020
weight: 3,
2121
description:
22-
'The code compiles, runs without errors, and produces the expected behavior.',
22+
'The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds.',
2323
},
2424
{
25-
name: 'Completeness',
25+
name: 'Existing Tests Pass',
2626
weight: 3,
2727
description:
28-
'All aspects of the prompt are addressed. No partial implementations or TODO comments.',
28+
'All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced.',
2929
},
3030
{
31-
name: 'Basic Style',
32-
weight: 1,
31+
name: 'Basic Completeness',
32+
weight: 2,
3333
description:
34-
'Code follows basic formatting conventions and is readable.',
34+
'All aspects of the prompt are addressed. No partial implementations or TODO comments left behind.',
3535
},
3636
],
3737
2: [
3838
{
39-
name: 'Pattern Consistency',
40-
weight: 2,
39+
name: 'Feature Works E2E',
40+
weight: 4,
4141
description:
42-
'New code follows the same patterns, naming conventions, and architectural style as existing code in the codebase.',
42+
'The new feature or bug fix actually works when you use the application. Start the app, navigate to the relevant page or endpoint, and exercise the feature. Use browser tools, curl, or the appropriate client to verify the happy path end-to-end.',
43+
},
44+
{
45+
name: 'Logs & Observability',
46+
weight: 1,
47+
description:
48+
'Check application logs for errors, warnings, or stack traces during E2E testing. Verify no unexpected errors appear when exercising the feature.',
4349
},
4450
],
4551
3: [
4652
{
47-
name: 'Test Quality',
53+
name: 'Edge Cases & Error States',
54+
weight: 3,
55+
description:
56+
'Test error states and edge cases E2E. Submit invalid inputs, trigger error conditions, test boundary values. Verify the app handles them gracefully without crashing.',
57+
},
58+
{
59+
name: 'UI/UX Verification',
4860
weight: 2,
4961
description:
50-
'Tests are meaningful, cover edge cases, and test behavior rather than implementation details.',
62+
'For UI changes: visually verify the rendered output. Check layout, responsiveness, and that the UI matches expectations. Take screenshots to document.',
5163
},
5264
],
5365
4: [
5466
{
55-
name: 'Optimal Design',
67+
name: 'Cross-Component Integration',
5668
weight: 2,
5769
description:
58-
'Code is DRY, uses the right abstractions, and the diff is minimal — no unnecessary changes.',
70+
'Verify the change works correctly with related features. Test flows that cross component boundaries. If a backend change was made, verify the frontend still works. If a DB migration was added, verify queries work.',
71+
},
72+
{
73+
name: 'Performance & No Regressions',
74+
weight: 2,
75+
description:
76+
'Verify no performance regressions. Check page load times, API response times, or resource usage. Ensure the change does not break unrelated features.',
5977
},
6078
],
6179
5: [
6280
{
63-
name: 'Fluency',
64-
weight: 1,
81+
name: 'Production Readiness',
82+
weight: 2,
6583
description:
66-
'Code reads like a senior engineer wrote it. Idiomatic usage of the language and framework. No over-engineering.',
84+
'Full production readiness check. Verify migrations, environment variable handling, error recovery, and graceful degradation. The change should be safe to deploy.',
6785
},
6886
],
6987
}
@@ -122,13 +140,13 @@ export function maybePromoteCriteria(
122140
}
123141

124142
/**
125-
* Format criteria as text for injection into judge prompts.
143+
* Format criteria as text for injection into reviewer agent prompts.
126144
*/
127145
export function formatCriteriaForPrompt(criteria: QualityCriteria): string {
128146
const lines = [
129147
`## Quality Criteria (Level ${criteria.level}/5)`,
130148
'',
131-
'Apply these additional quality criteria when scoring. Higher levels add stricter standards:',
149+
'You MUST verify each of these criteria. Higher levels require deeper E2E testing:',
132150
'',
133151
]
134152

@@ -138,7 +156,9 @@ export function formatCriteriaForPrompt(criteria: QualityCriteria): string {
138156

139157
lines.push(
140158
'',
141-
'Weight these criteria proportionally when computing scores. A violation of a high-weight criterion should have a bigger impact on the score than a low-weight one.',
159+
'For each criterion, describe what you tested and what you observed. If you cannot test a criterion (e.g., no UI for a backend change), note that and explain why.',
160+
'',
161+
'Weight these criteria proportionally when computing scores. A failure on a high-weight criterion should have a bigger impact on the score than a low-weight one.',
142162
)
143163

144164
return lines.join('\n')

evals/evalbuff/evalbuff-criteria.json

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,19 +2,19 @@
22
"level": 1,
33
"criteria": [
44
{
5-
"name": "Correctness",
5+
"name": "Builds & Compiles",
66
"weight": 3,
7-
"description": "The code compiles, runs without errors, and produces the expected behavior."
7+
"description": "The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds."
88
},
99
{
10-
"name": "Completeness",
10+
"name": "Existing Tests Pass",
1111
"weight": 3,
12-
"description": "All aspects of the prompt are addressed. No partial implementations or TODO comments."
12+
"description": "All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced."
1313
},
1414
{
15-
"name": "Basic Style",
16-
"weight": 1,
17-
"description": "Code follows basic formatting conventions and is readable."
15+
"name": "Basic Completeness",
16+
"weight": 2,
17+
"description": "All aspects of the prompt are addressed. No partial implementations or TODO comments left behind."
1818
}
1919
],
2020
"promotionThreshold": 8.0,

0 commit comments

Comments
 (0)