Overhaul judge and criteria for E2E testing with CLI agent reviewers

jahooma · claude · jahooma · commit bed5094500c4 · 2026-03-26T11:56:31.000-07:00
Major changes:

Judge: Replaced CodebuffClient SDK-based LLM judges with real CLI coding
agents (Claude Code, Codex, Gemini) that run IN the repo. Reviewer agents
can build, run tests, start the dev server, use browser tools, curl
endpoints, check logs — actual E2E verification, not just diff reading.
Structured output via result file (evalbuff-review-result.json) with
fallback to stdout JSON extraction.

Criteria: Shifted from code style (correctness, completeness, pattern
consistency, fluency) to E2E verification levels:
- L1: Builds, existing tests pass, basic completeness
- L2: Feature works E2E (browser/curl/client), logs clean
- L3: Edge cases & error states tested E2E, UI verification
- L4: Cross-component integration, performance, no regressions
- L5: Production readiness (migrations, env vars, error recovery)

Orchestrator: The judge now runs inside the withTestRepo callback so
reviewer agents have access to the live repo. CodebuffClient is now used
only for the doc writer (analyzeFailure). Added a --reviewers CLI flag.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/evals/evalbuff/__tests__/criteria.test.ts b/evals/evalbuff/__tests__/criteria.test.ts
@@ -26,32 +26,34 @@ describe('getCriteriaForLevel', () => {
     const criteria = getCriteriaForLevel(1)
     expect(criteria).toHaveLength(3)
     expect(criteria.map((c) => c.name)).toEqual([
-      'Correctness',
-      'Completeness',
-      'Basic Style',
+      'Builds & Compiles',
+      'Existing Tests Pass',
+      'Basic Completeness',
     ])
   })
 
   it('accumulates criteria up to level 3', () => {
     const criteria = getCriteriaForLevel(3)
     expect(criteria.map((c) => c.name)).toEqual([
-      'Correctness',
-      'Completeness',
-      'Basic Style',
-      'Pattern Consistency',
-      'Test Quality',
+      'Builds & Compiles',
+      'Existing Tests Pass',
+      'Basic Completeness',
+      'Feature Works E2E',
+      'Logs & Observability',
+      'Edge Cases & Error States',
+      'UI/UX Verification',
     ])
   })
 
   it('includes all criteria at level 5', () => {
     const criteria = getCriteriaForLevel(5)
-    expect(criteria).toHaveLength(7)
-    expect(criteria[criteria.length - 1].name).toBe('Fluency')
+    expect(criteria).toHaveLength(10)
+    expect(criteria[criteria.length - 1].name).toBe('Production Readiness')
   })
 
   it('caps at level 5 even if higher number passed', () => {
     const criteria = getCriteriaForLevel(10)
-    expect(criteria).toHaveLength(7)
+    expect(criteria).toHaveLength(10)
   })
 })
 
@@ -86,26 +88,32 @@ describe('maybePromoteCriteria', () => {
 
   it('uses only the last N scores in the window', () => {
     const criteria = makeCriteria(2, 8.0, 3)
-    // Old scores are low, but last 3 are high
     const scores = [3.0, 4.0, 5.0, 8.5, 9.0, 8.5]
     const newLevel = maybePromoteCriteria(criteria, scores)
     expect(newLevel).toBe(3)
   })
 })
 
 describe('formatCriteriaForPrompt', () => {
-  it('includes level and all criteria names', () => {
+  it('includes level and E2E-focused criteria names', () => {
     const criteria = makeCriteria(2)
     const prompt = formatCriteriaForPrompt(criteria)
     expect(prompt).toContain('Level 2/5')
-    expect(prompt).toContain('Correctness')
-    expect(prompt).toContain('Pattern Consistency')
+    expect(prompt).toContain('Builds & Compiles')
+    expect(prompt).toContain('Feature Works E2E')
   })
 
   it('includes weights', () => {
     const criteria = makeCriteria(1)
     const prompt = formatCriteriaForPrompt(criteria)
     expect(prompt).toContain('weight: 3')
-    expect(prompt).toContain('weight: 1')
+    expect(prompt).toContain('weight: 2')
+  })
+
+  it('instructs E2E verification', () => {
+    const criteria = makeCriteria(1)
+    const prompt = formatCriteriaForPrompt(criteria)
+    expect(prompt).toContain('MUST verify')
+    expect(prompt).toContain('E2E testing')
   })
 })
diff --git a/evals/evalbuff/__tests__/e2e.test.ts b/evals/evalbuff/__tests__/e2e.test.ts
@@ -55,8 +55,10 @@ mock.module('../judge', () => ({
       analysis: `Mock analysis for call ${judgeCallCount}`,
       strengths: ['Correctly identified the problem'],
       weaknesses: ['Missing error handling', 'No tests added'],
+      e2eTestsPerformed: ['Started dev server', 'Tested API endpoint'],
       completionScore: score,
       codeQualityScore: score,
+      e2eScore: score,
       overallScore: score,
     } satisfies JudgingResult
   },
diff --git a/evals/evalbuff/__tests__/loop.integration.test.ts b/evals/evalbuff/__tests__/loop.integration.test.ts
@@ -56,8 +56,10 @@ mock.module('../judge', () => ({
       analysis: 'Mock analysis',
       strengths: ['Good'],
       weaknesses: ['Could improve'],
+      e2eTestsPerformed: ['Mock E2E test'],
       completionScore: score,
       codeQualityScore: score,
+      e2eScore: score,
       overallScore: score,
     } satisfies JudgingResult
   },
diff --git a/evals/evalbuff/criteria.ts b/evals/evalbuff/criteria.ts
@@ -16,54 +16,72 @@ export interface QualityCriteria {
 export const DEFAULT_CRITERIA: Record<number, QualityCriterion[]> = {
   1: [
     {
-      name: 'Correctness',
+      name: 'Builds & Compiles',
       weight: 3,
       description:
-        'The code compiles, runs without errors, and produces the expected behavior.',
+        'The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds.',
     },
     {
-      name: 'Completeness',
+      name: 'Existing Tests Pass',
       weight: 3,
       description:
-        'All aspects of the prompt are addressed. No partial implementations or TODO comments.',
+        'All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced.',
     },
     {
-      name: 'Basic Style',
-      weight: 1,
+      name: 'Basic Completeness',
+      weight: 2,
       description:
-        'Code follows basic formatting conventions and is readable.',
+        'All aspects of the prompt are addressed. No partial implementations or TODO comments left behind.',
     },
   ],
   2: [
     {
-      name: 'Pattern Consistency',
-      weight: 2,
+      name: 'Feature Works E2E',
+      weight: 4,
       description:
-        'New code follows the same patterns, naming conventions, and architectural style as existing code in the codebase.',
+        'The new feature or bug fix actually works when you use the application. Start the app, navigate to the relevant page or endpoint, and exercise the feature. Use browser tools, curl, or the appropriate client to verify the happy path end-to-end.',
+    },
+    {
+      name: 'Logs & Observability',
+      weight: 1,
+      description:
+        'Check application logs for errors, warnings, or stack traces during E2E testing. Verify no unexpected errors appear when exercising the feature.',
     },
   ],
   3: [
     {
-      name: 'Test Quality',
+      name: 'Edge Cases & Error States',
+      weight: 3,
+      description:
+        'Test error states and edge cases E2E. Submit invalid inputs, trigger error conditions, test boundary values. Verify the app handles them gracefully without crashing.',
+    },
+    {
+      name: 'UI/UX Verification',
       weight: 2,
       description:
-        'Tests are meaningful, cover edge cases, and test behavior rather than implementation details.',
+        'For UI changes: visually verify the rendered output. Check layout, responsiveness, and that the UI matches expectations. Take screenshots to document.',
     },
   ],
   4: [
     {
-      name: 'Optimal Design',
+      name: 'Cross-Component Integration',
       weight: 2,
       description:
-        'Code is DRY, uses the right abstractions, and the diff is minimal — no unnecessary changes.',
+        'Verify the change works correctly with related features. Test flows that cross component boundaries. If a backend change was made, verify the frontend still works. If a DB migration was added, verify queries work.',
+    },
+    {
+      name: 'Performance & No Regressions',
+      weight: 2,
+      description:
+        'Verify no performance regressions. Check page load times, API response times, or resource usage. Ensure the change does not break unrelated features.',
     },
   ],
   5: [
     {
-      name: 'Fluency',
-      weight: 1,
+      name: 'Production Readiness',
+      weight: 2,
       description:
-        'Code reads like a senior engineer wrote it. Idiomatic usage of the language and framework. No over-engineering.',
+        'Full production readiness check. Verify migrations, environment variable handling, error recovery, and graceful degradation. The change should be safe to deploy.',
     },
   ],
 }
@@ -122,13 +140,13 @@ export function maybePromoteCriteria(
 }
 
 /**
- * Format criteria as text for injection into judge prompts.
+ * Format criteria as text for injection into reviewer agent prompts.
  */
 export function formatCriteriaForPrompt(criteria: QualityCriteria): string {
   const lines = [
     `## Quality Criteria (Level ${criteria.level}/5)`,
     '',
-    'Apply these additional quality criteria when scoring. Higher levels add stricter standards:',
+    'You MUST verify each of these criteria. Higher levels require deeper E2E testing:',
     '',
   ]
 
@@ -138,7 +156,9 @@ export function formatCriteriaForPrompt(criteria: QualityCriteria): string {
 
   lines.push(
     '',
-    'Weight these criteria proportionally when computing scores. A violation of a high-weight criterion should have a bigger impact on the score than a low-weight one.',
+    'For each criterion, describe what you tested and what you observed. If you cannot test a criterion (e.g., no UI for a backend change), note that and explain why.',
+    '',
+    'Weight these criteria proportionally when computing scores. A failure on a high-weight criterion should have a bigger impact on the score than a low-weight one.',
   )
 
   return lines.join('\n')
diff --git a/evals/evalbuff/evalbuff-criteria.json b/evals/evalbuff/evalbuff-criteria.json
@@ -2,19 +2,19 @@
   "level": 1,
   "criteria": [
     {
-      "name": "Correctness",
+      "name": "Builds & Compiles",
       "weight": 3,
-      "description": "The code compiles, runs without errors, and produces the expected behavior."
+      "description": "The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds."
     },
     {
-      "name": "Completeness",
+      "name": "Existing Tests Pass",
       "weight": 3,
-      "description": "All aspects of the prompt are addressed. No partial implementations or TODO comments."
+      "description": "All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced."
     },
     {
-      "name": "Basic Style",
-      "weight": 1,
-      "description": "Code follows basic formatting conventions and is readable."
+      "name": "Basic Completeness",
+      "weight": 2,
+      "description": "All aspects of the prompt are addressed. No partial implementations or TODO comments left behind."
     }
   ],
   "promotionThreshold": 8.0,
diff --git a/evals/evalbuff/judge.ts b/evals/evalbuff/judge.ts
diff --git a/evals/evalbuff/run-evalbuff.ts b/evals/evalbuff/run-evalbuff.ts