Skip to content

Commit 1a754ce

Browse files
jahooma and claude committed
Add E2E test for evalbuff full loop
Verifies the complete evalbuff pipeline: 3 eval tasks run through the orchestrator with mock LLM judges, doc edits applied and committed, morning report generated, state tracking, and AGENTS.md TOC created. Total test coverage: 42 tests (35 unit + 6 integration + 1 E2E). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 884a565 commit 1a754ce

File tree

2 files changed

+232
-1
lines changed

2 files changed

+232
-1
lines changed
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
/**
2+
* E2E test for evalbuff.
3+
*
4+
 * This test runs the full evalbuff loop with a stub agent command (`echo`) on a local
5+
* git repo with synthetic eval tasks. It verifies:
6+
* - The morning report is generated
7+
* - Log entries are written
8+
* - State file tracks completed tasks
9+
* - Doc edits are committed to the repo when they improve scores
10+
*
11+
* This test uses mock.module to replace LLM calls but runs the full
12+
* orchestrator, CLI runner, and git operations for real.
13+
*
14+
* Run: bun test evals/evalbuff/__tests__/e2e.test.ts
15+
*/
16+
import { execSync } from 'child_process'
17+
import fs from 'fs'
18+
import os from 'os'
19+
import path from 'path'
20+
21+
import { afterAll, beforeAll, describe, expect, it, mock } from 'bun:test'
22+
23+
import type { JudgingResult } from '../judge'
24+
import type { DocSuggestion } from '../docs-optimizer'
25+
import type { EvalDataV2 } from '../types'
26+
27+
// --- Mocks: LLM calls (judge, failure analysis), the SDK client, and test-repo setup ---
28+
29+
// Counts invocations of the mocked judge; used to pick the next canned score and reset in beforeAll.
let judgeCallCount = 0
30+
31+
/**
 * Replaces `withTestRepo` from '../test-repo-utils' with a version that hands
 * the callback a freshly initialised local git repository created in the OS
 * temp directory.
 *
 * The repository gets a single empty "init" commit made under a fixed test
 * identity. The directory is removed once the callback settles, whether it
 * resolved or rejected. Repository setup runs inside the `try` block so the
 * temp directory is also removed when `git init` / `git commit` fails.
 *
 * @param _config Ignored by this mock.
 * @param fn Callback invoked with the path of the temporary repository.
 * @returns Whatever `fn` resolves to.
 */
mock.module('../test-repo-utils', () => ({
  withTestRepo: async <T>(
    _config: unknown,
    fn: (cwd: string) => Promise<T>,
  ): Promise<T> => {
    // Create a real local git repo for each call
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repo-'))
    try {
      execSync('git init && git add . && git commit --allow-empty -m "init"', {
        cwd: dir,
        stdio: 'ignore',
        env: {
          ...process.env,
          GIT_AUTHOR_NAME: 'test',
          GIT_AUTHOR_EMAIL: 'test@test.com',
          GIT_COMMITTER_NAME: 'test',
          GIT_COMMITTER_EMAIL: 'test@test.com',
        },
      })
      return await fn(dir)
    } finally {
      // Runs for setup failures too, so no temp directory is left behind.
      fs.rmSync(dir, { recursive: true, force: true })
    }
  },
}))
47+
48+
// Mocked judge: each call returns the next value from a fixed six-entry score
// list (wrapping around) and advances the shared `judgeCallCount`. The list
// starts low and then rises — intended to trigger a doc edit and then confirm
// an improvement (TODO confirm against the orchestrator's scoring logic).
mock.module('../judge', () => {
  const cannedScores = [3.0, 6.0, 8.5, 5.0, 7.0, 9.0]

  async function judgeCommitResult() {
    const score = cannedScores[judgeCallCount % cannedScores.length]
    judgeCallCount += 1

    // The analysis text is built after the increment, so call numbers are 1-based.
    const verdict = {
      analysis: `Mock analysis for call ${judgeCallCount}`,
      strengths: ['Correctly identified the problem'],
      weaknesses: ['Missing error handling', 'No tests added'],
      completionScore: score,
      codeQualityScore: score,
      overallScore: score,
    } satisfies JudgingResult

    return verdict
  }

  return { judgeCommitResult }
})
64+
65+
// Keep every real export of '../docs-optimizer' and override only
// `analyzeFailure`, which resolves to a fixed doc suggestion on every call.
const actualDocsOptimizer = await import('../docs-optimizer')

mock.module('../docs-optimizer', () => {
  async function analyzeFailure() {
    // Built inside the function so each call hands out a fresh object.
    const suggestion = {
      reasoning: 'Agent consistently misses error handling patterns in async code',
      suggestedDocPath: 'patterns/async-error-handling.md',
      suggestedContent:
        '# Async Error Handling\n\nAll async functions should use try/catch blocks.\nPropagate errors with meaningful messages.\n\n## Examples\n\n```ts\nasync function fetchData() {\n try {\n const result = await api.get("/data")\n return result\n } catch (error) {\n throw new Error(`Failed to fetch data: ${error.message}`)\n }\n}\n```\n',
    } satisfies DocSuggestion

    return suggestion
  }

  return { ...actualDocsOptimizer, analyzeFailure }
})
76+
77+
// Stand-in for '@codebuff/sdk': the only export is an empty `CodebuffClient`
// class whose construction does nothing.
mock.module('@codebuff/sdk', () => {
  class CodebuffClient {}

  return { CodebuffClient }
})
82+
83+
// Loaded via dynamic import after the mock.module calls — presumably so run-evalbuff resolves the mocked modules (verify against Bun's module-mock semantics).
const { runEvalbuff } = await import('../run-evalbuff')
84+
85+
// --- Test setup ---
86+
87+
// Temporary git repository created in beforeAll; passed to runEvalbuff as repoPath and inspected by the assertions.
let repoDir: string
88+
// Path of the JSON eval file written in beforeAll and passed to runEvalbuff via evalDataPaths.
let evalFilePath: string
89+
90+
/**
 * One-time fixture setup:
 *  1. creates a temporary git repository (`repoDir`) with an empty "init" commit,
 *  2. writes an eval file with three synthetic tasks into it (`evalFilePath`),
 *  3. resets the mocked judge's call counter.
 */
beforeAll(() => {
  // Fixed author/committer identity for the initial commit — presumably so
  // the commit succeeds on machines without a global git identity.
  const gitEnv = {
    ...process.env,
    GIT_AUTHOR_NAME: 'test',
    GIT_AUTHOR_EMAIL: 'test@test.com',
    GIT_COMMITTER_NAME: 'test',
    GIT_COMMITTER_EMAIL: 'test@test.com',
  }

  // Create a "target repo" where docs will be written
  repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-target-'))
  execSync('git init && git add . && git commit --allow-empty -m "init"', {
    cwd: repoDir,
    stdio: 'ignore',
    env: gitEnv,
  })

  // Every synthetic task touches exactly one file with status "modified".
  const modifiedFile = (filePath: string, diff: string) => ({
    path: filePath,
    status: 'modified' as const,
    diff,
  })

  // Create eval file with 3 tasks
  const evalData: EvalDataV2 = {
    repoUrl: 'https://github.com/test/repo',
    generationDate: '2026-03-25',
    evalCommits: [
      {
        id: 'e2e-task-1',
        sha: 'aaa111',
        parentSha: 'aaa000',
        spec: 'Add error handling to fetchData',
        prompt: 'Add try/catch error handling to the fetchData function in src/api.ts',
        supplementalFiles: [],
        fileDiffs: [
          modifiedFile(
            'src/api.ts',
            '@@ -5,3 +5,7 @@\n-const data = await fetch(url)\n+try {\n+ const data = await fetch(url)\n+} catch (e) {\n+ throw new Error(`Fetch failed: ${e.message}`)\n+}',
          ),
        ],
      },
      {
        id: 'e2e-task-2',
        sha: 'bbb222',
        parentSha: 'bbb000',
        spec: 'Add input validation',
        prompt: 'Add input validation to the createUser endpoint',
        supplementalFiles: [],
        fileDiffs: [
          modifiedFile(
            'src/routes/users.ts',
            '@@ -1 +1,5 @@\n+if (!name || !email) {\n+ throw new Error("name and email required")\n+}',
          ),
        ],
      },
      {
        id: 'e2e-task-3',
        sha: 'ccc333',
        parentSha: 'ccc000',
        spec: 'Refactor logger',
        prompt: 'Refactor the logger to use structured JSON output',
        supplementalFiles: [],
        fileDiffs: [
          modifiedFile(
            'src/logger.ts',
            '@@ -1 +1,3 @@\n-console.log(msg)\n+const entry = { timestamp: Date.now(), message: msg }\n+process.stdout.write(JSON.stringify(entry) + "\\n")',
          ),
        ],
      },
    ],
  }

  evalFilePath = path.join(repoDir, 'eval-e2e.json')
  fs.writeFileSync(evalFilePath, JSON.stringify(evalData))

  // Start the canned judge scores from the first entry.
  judgeCallCount = 0
})
157+
158+
/**
 * Removes the temporary target repository created in `beforeAll`.
 *
 * Does nothing when `repoDir` was never assigned (i.e. `beforeAll` failed
 * before creating the directory); calling `fs.rmSync` with an undefined path
 * would throw an argument-type error and obscure the original failure.
 */
afterAll(() => {
  if (!repoDir) return
  fs.rmSync(repoDir, { recursive: true, force: true })
})
161+
162+
// --- E2E tests ---
163+
164+
/**
 * End-to-end check of `runEvalbuff` against the temporary repository and eval
 * file prepared in `beforeAll`. After one run with three iterations it
 * inspects the artifacts left in `repoDir`: the morning report, the JSONL
 * log, the state file, the docs directory, AGENTS.md and the git history.
 */
describe('evalbuff E2E', () => {
  // The run spawns git and agent subprocesses and allows each agent up to
  // `agentTimeoutMs` (10 s), which exceeds Bun's default per-test timeout.
  // Set an explicit, generous limit so the test is not cut off prematurely.
  const E2E_TEST_TIMEOUT_MS = 60_000

  it(
    'runs full loop: agent, judge, doc edit, morning report',
    async () => {
      await runEvalbuff({
        repoPath: repoDir,
        agentCommand: 'echo', // echo just prints the prompt and exits
        evalDataPaths: [evalFilePath],
        maxIterations: 3,
        maxCostUsd: 50,
        scoreThreshold: 7.0,
        agentTimeoutMs: 10_000,
      })

      // 1. Morning report exists
      const reportFiles = fs
        .readdirSync(repoDir)
        .filter((f) => f.startsWith('evalbuff-report-'))
      expect(reportFiles.length).toBe(1)
      const report = fs.readFileSync(
        path.join(repoDir, reportFiles[0]),
        'utf-8',
      )
      expect(report).toContain('# Evalbuff Morning Report')
      expect(report).toContain('Iterations | 3')

      // 2. Log has 3 entries
      const logPath = path.join(repoDir, 'evalbuff-log.jsonl')
      expect(fs.existsSync(logPath)).toBe(true)
      const logLines = fs.readFileSync(logPath, 'utf-8').trim().split('\n')
      expect(logLines).toHaveLength(3)

      // 3. State tracks all 3 completed tasks
      const statePath = path.join(repoDir, 'evalbuff-state.json')
      const state: { completedTaskIds?: unknown } = JSON.parse(
        fs.readFileSync(statePath, 'utf-8'),
      )
      expect(state.completedTaskIds).toEqual([
        'e2e-task-1',
        'e2e-task-2',
        'e2e-task-3',
      ])

      // 4. At least one doc was written (first task scores 3.0, below threshold)
      const docsDir = path.join(repoDir, 'docs')
      expect(fs.existsSync(docsDir)).toBe(true)

      // 5. AGENTS.md was created with TOC
      const agentsMdPath = path.join(repoDir, 'AGENTS.md')
      expect(fs.existsSync(agentsMdPath)).toBe(true)
      const agentsMd = fs.readFileSync(agentsMdPath, 'utf-8')
      expect(agentsMd).toContain('async-error-handling.md')

      // 6. Doc edits were committed to git
      const gitLog = execSync('git log --oneline', {
        cwd: repoDir,
        encoding: 'utf-8',
      })
      expect(gitLog).toContain('evalbuff:')

      // 7. Log entries have correct task IDs
      const parsedEntries = logLines.map(
        (line): { taskId?: unknown } => JSON.parse(line),
      )
      expect(parsedEntries.map((entry) => entry.taskId)).toEqual([
        'e2e-task-1',
        'e2e-task-2',
        'e2e-task-3',
      ])
    },
    E2E_TEST_TIMEOUT_MS,
  )
})

evals/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"run-buffbench": "bun run buffbench/main.ts",
2525
"run-buffbench-nightly": "bun run buffbench/main-nightly.ts",
2626
"run-evalbuff": "bun run evalbuff/run-evalbuff.ts",
27-
"test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts",
27+
"test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts && bun test evalbuff/__tests__/e2e.test.ts",
2828
"trigger-buffbench": "bun run scripts/trigger-buffbench.ts",
2929
"setup-codebuff-repo": "bun run setup-codebuff-repo.ts"
3030
},

0 commit comments

Comments
 (0)