Skip to content

Commit 884a565

Browse files
jahooma and claude committed
Add integration tests for evalbuff orchestrator loop
6 integration tests covering: full iteration flow, doc edit attempts, maxIterations budget cap, cost budget cap, resume from state file, and doc revert on score regression. Uses bun mock.module to avoid real LLM calls and remote repo cloning. Also guards run-evalbuff.ts CLI entry point with import.meta.main and adds test:evalbuff script that runs unit + integration tests in separate processes to avoid mock.module leakage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f411a3f commit 884a565

File tree

3 files changed

+348
-4
lines changed

3 files changed

+348
-4
lines changed
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
import { execSync } from 'child_process'
2+
import fs from 'fs'
3+
import os from 'os'
4+
import path from 'path'
5+
6+
import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test'
7+
8+
import type { JudgingResult } from '../judge'
9+
import type { DocSuggestion } from '../docs-optimizer'
10+
import type { EvalDataV2 } from '../types'
11+
12+
// --- Mocks ---
13+
14+
// Track calls to mocked functions
15+
let judgeCallCount = 0
16+
let judgeScores: number[] = []
17+
let analyzeFailureResult: DocSuggestion | null = null
18+
let cliRunnerCallCount = 0
19+
20+
// Mock withTestRepo to use a local temp dir instead of cloning
21+
mock.module('../test-repo-utils', () => ({
22+
withTestRepo: async (_config: any, fn: (cwd: string) => Promise<any>) => {
23+
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-mock-repo-'))
24+
execSync('git init && git add . && git commit --allow-empty -m "init"', {
25+
cwd: dir,
26+
stdio: 'ignore',
27+
})
28+
try {
29+
return await fn(dir)
30+
} finally {
31+
fs.rmSync(dir, { recursive: true, force: true })
32+
}
33+
},
34+
}))
35+
36+
// Mock CLI runner to return a fake result
37+
mock.module('../cli-runner', () => ({
38+
runCliAgent: async () => {
39+
cliRunnerCallCount++
40+
return {
41+
diff: 'mock diff content',
42+
durationMs: 1000,
43+
exitCode: 0,
44+
stdout: 'mock stdout',
45+
stderr: '',
46+
}
47+
},
48+
}))
49+
50+
// Mock judge to return configurable scores
51+
mock.module('../judge', () => ({
52+
judgeCommitResult: async () => {
53+
const score = judgeScores[judgeCallCount] ?? 5.0
54+
judgeCallCount++
55+
return {
56+
analysis: 'Mock analysis',
57+
strengths: ['Good'],
58+
weaknesses: ['Could improve'],
59+
completionScore: score,
60+
codeQualityScore: score,
61+
overallScore: score,
62+
} satisfies JudgingResult
63+
},
64+
}))
65+
66+
// Mock docs-optimizer LLM calls but keep pure functions
67+
const actualDocsOptimizer = await import('../docs-optimizer')
68+
mock.module('../docs-optimizer', () => ({
69+
...actualDocsOptimizer,
70+
analyzeFailure: async () => analyzeFailureResult,
71+
}))
72+
73+
// Mock CodebuffClient
74+
mock.module('@codebuff/sdk', () => ({
75+
CodebuffClient: class {
76+
constructor() {}
77+
async run() {
78+
return { output: { type: 'text', value: '' } }
79+
}
80+
},
81+
}))
82+
83+
// Import after mocks are set up
84+
const { runEvalbuff } = await import('../run-evalbuff')
85+
86+
// --- Test fixtures ---
87+
88+
let repoDir: string
89+
let evalFilePath: string
90+
91+
function createEvalFile(taskCount: number): string {
92+
const evalData: EvalDataV2 = {
93+
repoUrl: 'https://github.com/test/repo',
94+
generationDate: '2026-03-25',
95+
evalCommits: Array.from({ length: taskCount }, (_, i) => ({
96+
id: `task-${i + 1}`,
97+
sha: `sha-${i + 1}`,
98+
parentSha: `parent-${i + 1}`,
99+
spec: `Test task ${i + 1}`,
100+
prompt: `Do task ${i + 1}`,
101+
supplementalFiles: [],
102+
fileDiffs: [
103+
{
104+
path: `src/file${i + 1}.ts`,
105+
status: 'modified' as const,
106+
diff: `@@ -1 +1 @@\n-old\n+new`,
107+
},
108+
],
109+
})),
110+
}
111+
112+
const filePath = path.join(repoDir, `eval-test.json`)
113+
fs.writeFileSync(filePath, JSON.stringify(evalData))
114+
return filePath
115+
}
116+
117+
beforeEach(() => {
118+
repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-integ-'))
119+
execSync('git init && git add . && git commit --allow-empty -m "init"', {
120+
cwd: repoDir,
121+
stdio: 'ignore',
122+
})
123+
evalFilePath = createEvalFile(5)
124+
125+
// Reset mock state
126+
judgeCallCount = 0
127+
judgeScores = []
128+
analyzeFailureResult = null
129+
cliRunnerCallCount = 0
130+
})
131+
132+
afterEach(() => {
133+
fs.rmSync(repoDir, { recursive: true, force: true })
134+
})
135+
136+
// --- Tests ---
137+
138+
describe('runEvalbuff integration', () => {
139+
it('completes one full iteration: runs agent, judges, and logs', async () => {
140+
judgeScores = [8.0] // Above threshold, no doc edit attempted
141+
142+
await runEvalbuff({
143+
repoPath: repoDir,
144+
agentCommand: 'echo',
145+
evalDataPaths: [evalFilePath],
146+
maxIterations: 1,
147+
maxCostUsd: 100,
148+
scoreThreshold: 7.0,
149+
agentTimeoutMs: 10_000,
150+
})
151+
152+
// Verify log was written
153+
const logPath = path.join(repoDir, 'evalbuff-log.jsonl')
154+
expect(fs.existsSync(logPath)).toBe(true)
155+
const logLines = fs
156+
.readFileSync(logPath, 'utf-8')
157+
.trim()
158+
.split('\n')
159+
expect(logLines).toHaveLength(1)
160+
161+
const entry = JSON.parse(logLines[0])
162+
expect(entry.taskId).toBe('task-1')
163+
expect(entry.oldScore).toBe(8.0)
164+
expect(entry.docEdit).toBeNull()
165+
166+
// Verify state was saved
167+
const statePath = path.join(repoDir, 'evalbuff-state.json')
168+
expect(fs.existsSync(statePath)).toBe(true)
169+
const state = JSON.parse(fs.readFileSync(statePath, 'utf-8'))
170+
expect(state.completedTaskIds).toContain('task-1')
171+
172+
// Verify morning report was generated
173+
const reportFiles = fs
174+
.readdirSync(repoDir)
175+
.filter((f) => f.startsWith('evalbuff-report-'))
176+
expect(reportFiles.length).toBeGreaterThan(0)
177+
})
178+
179+
it('attempts doc edit when score is below threshold', async () => {
180+
// First judge call returns low score, second (after doc edit) returns higher
181+
judgeScores = [4.0, 6.0]
182+
analyzeFailureResult = {
183+
reasoning: 'Agent missed error handling patterns',
184+
suggestedDocPath: 'patterns/errors.md',
185+
suggestedContent: '# Error Handling\n\nAlways use try/catch.',
186+
}
187+
188+
await runEvalbuff({
189+
repoPath: repoDir,
190+
agentCommand: 'echo',
191+
evalDataPaths: [evalFilePath],
192+
maxIterations: 1,
193+
maxCostUsd: 100,
194+
scoreThreshold: 7.0,
195+
agentTimeoutMs: 10_000,
196+
})
197+
198+
const logPath = path.join(repoDir, 'evalbuff-log.jsonl')
199+
const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim())
200+
expect(entry.oldScore).toBe(4.0)
201+
expect(entry.newScore).toBe(6.0)
202+
expect(entry.scoreComparison).toBe('improved')
203+
expect(entry.docEdit).not.toBeNull()
204+
expect(entry.docEdit.path).toBe('patterns/errors.md')
205+
206+
// Doc should have been applied to the real repo
207+
const docPath = path.join(repoDir, 'docs', 'patterns', 'errors.md')
208+
expect(fs.existsSync(docPath)).toBe(true)
209+
expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling')
210+
})
211+
212+
it('stops at maxIterations', async () => {
213+
judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0]
214+
215+
await runEvalbuff({
216+
repoPath: repoDir,
217+
agentCommand: 'echo',
218+
evalDataPaths: [evalFilePath], // 5 tasks available
219+
maxIterations: 2,
220+
maxCostUsd: 100,
221+
scoreThreshold: 7.0,
222+
agentTimeoutMs: 10_000,
223+
})
224+
225+
const logPath = path.join(repoDir, 'evalbuff-log.jsonl')
226+
const logLines = fs
227+
.readFileSync(logPath, 'utf-8')
228+
.trim()
229+
.split('\n')
230+
expect(logLines).toHaveLength(2)
231+
232+
const state = JSON.parse(
233+
fs.readFileSync(path.join(repoDir, 'evalbuff-state.json'), 'utf-8'),
234+
)
235+
expect(state.completedTaskIds).toHaveLength(2)
236+
})
237+
238+
it('stops when cost exceeds maxCostUsd', async () => {
239+
judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0]
240+
241+
// First run — complete 1 task, which will accumulate some cost
242+
await runEvalbuff({
243+
repoPath: repoDir,
244+
agentCommand: 'echo',
245+
evalDataPaths: [evalFilePath],
246+
maxIterations: 1,
247+
maxCostUsd: 100,
248+
scoreThreshold: 7.0,
249+
agentTimeoutMs: 10_000,
250+
})
251+
252+
// Manually set cost in state to be at the limit
253+
const statePath = path.join(repoDir, 'evalbuff-state.json')
254+
const state = JSON.parse(fs.readFileSync(statePath, 'utf-8'))
255+
state.totalCostUsd = 100.0
256+
fs.writeFileSync(statePath, JSON.stringify(state))
257+
258+
// Second run — should stop immediately due to cost (>= maxCostUsd)
259+
await runEvalbuff({
260+
repoPath: repoDir,
261+
agentCommand: 'echo',
262+
evalDataPaths: [evalFilePath],
263+
maxIterations: 50,
264+
maxCostUsd: 100,
265+
scoreThreshold: 7.0,
266+
agentTimeoutMs: 10_000,
267+
})
268+
269+
// Should still only have 1 completed task (cost check prevents new tasks)
270+
const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8'))
271+
expect(finalState.completedTaskIds).toHaveLength(1)
272+
})
273+
274+
it('resumes from state file and skips completed tasks', async () => {
275+
judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0]
276+
277+
// Pre-populate state with 2 completed tasks
278+
const statePath = path.join(repoDir, 'evalbuff-state.json')
279+
fs.writeFileSync(
280+
statePath,
281+
JSON.stringify({
282+
completedTaskIds: ['task-1', 'task-2'],
283+
totalCostUsd: 5.0,
284+
recentScores: [7.0, 8.0],
285+
}),
286+
)
287+
288+
await runEvalbuff({
289+
repoPath: repoDir,
290+
agentCommand: 'echo',
291+
evalDataPaths: [evalFilePath], // 5 tasks
292+
maxIterations: 50,
293+
maxCostUsd: 100,
294+
scoreThreshold: 7.0,
295+
agentTimeoutMs: 10_000,
296+
})
297+
298+
// Should have processed tasks 3-5 (skipped 1 and 2)
299+
const logPath = path.join(repoDir, 'evalbuff-log.jsonl')
300+
const logLines = fs
301+
.readFileSync(logPath, 'utf-8')
302+
.trim()
303+
.split('\n')
304+
expect(logLines).toHaveLength(3)
305+
306+
const taskIds = logLines.map((l) => JSON.parse(l).taskId)
307+
expect(taskIds).toEqual(['task-3', 'task-4', 'task-5'])
308+
309+
const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8'))
310+
expect(finalState.completedTaskIds).toHaveLength(5)
311+
})
312+
313+
it('reverts doc edit when score does not improve', async () => {
314+
// First judge: low score, second judge: even lower (doc didn't help)
315+
judgeScores = [4.0, 3.0]
316+
analyzeFailureResult = {
317+
reasoning: 'Tried to help',
318+
suggestedDocPath: 'bad-doc.md',
319+
suggestedContent: '# Bad Doc\n\nThis will not help.',
320+
}
321+
322+
await runEvalbuff({
323+
repoPath: repoDir,
324+
agentCommand: 'echo',
325+
evalDataPaths: [evalFilePath],
326+
maxIterations: 1,
327+
maxCostUsd: 100,
328+
scoreThreshold: 7.0,
329+
agentTimeoutMs: 10_000,
330+
})
331+
332+
const logPath = path.join(repoDir, 'evalbuff-log.jsonl')
333+
const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim())
334+
expect(entry.scoreComparison).toBe('worse')
335+
336+
// Doc should NOT exist in the real repo
337+
const docPath = path.join(repoDir, 'docs', 'bad-doc.md')
338+
expect(fs.existsSync(docPath)).toBe(false)
339+
})
340+
})

evals/evalbuff/run-evalbuff.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,10 @@ async function main() {
422422
})
423423
}
424424

425-
main().catch((error) => {
426-
console.error('Evalbuff failed:', error)
427-
process.exit(1)
428-
})
425+
// Only run CLI when executed directly (not when imported)
426+
if (import.meta.main) {
427+
main().catch((error) => {
428+
console.error('Evalbuff failed:', error)
429+
process.exit(1)
430+
})
431+
}

evals/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"run-buffbench": "bun run buffbench/main.ts",
2525
"run-buffbench-nightly": "bun run buffbench/main-nightly.ts",
2626
"run-evalbuff": "bun run evalbuff/run-evalbuff.ts",
27+
"test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts",
2728
"trigger-buffbench": "bun run scripts/trigger-buffbench.ts",
2829
"setup-codebuff-repo": "bun run setup-codebuff-repo.ts"
2930
},

0 commit comments

Comments
 (0)