Skip to content

Commit 86d3bce

Browse files
jahooma and claude committed
evalbuff: add real E2E test runner script
Creates a local git repo with a simple subtract bug, generates an eval task, and runs the full evalbuff loop with real CLI agents. No mocks. Usage: bun run evals/evalbuff/run-e2e-test.ts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3100dda commit 86d3bce

File tree

1 file changed

+379
-0
lines changed

1 file changed

+379
-0
lines changed

evals/evalbuff/run-e2e-test.ts

Lines changed: 379 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,379 @@
1+
/**
2+
* Real E2E test for evalbuff.
3+
*
4+
* Creates a local git repo with a simple project, generates an eval task,
5+
* and runs the full evalbuff loop with real CLI coding agents and real
6+
* reviewer agents. No mocks.
7+
*
8+
* Prerequisites:
9+
* - `claude` CLI installed and authenticated
10+
* - (Optional) `codex` CLI installed with OPENAI_API_KEY set
11+
*
12+
* Usage:
13+
* bun run evals/evalbuff/run-e2e-test.ts
14+
*/
15+
import { execSync } from 'child_process'
16+
import fs from 'fs'
17+
import os from 'os'
18+
import path from 'path'
19+
20+
import { runEvalbuff } from './run-evalbuff'
21+
22+
import type { ReviewerAgentType } from './judge'
23+
import type { EvalDataV2 } from './types'
24+
25+
// --- Setup ---
26+
27+
// Every artifact of a run lives under one freshly created temp directory, so
// removing BASE_DIR removes the whole run.
const BASE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-real-e2e-'))
// Working tree in which the sample project and its two commits are authored.
const PROJECT_DIR = path.join(BASE_DIR, 'project')
// Destination of the bare clone made from PROJECT_DIR.
const BARE_REPO = path.join(BASE_DIR, 'project.git')
// Separate repository that is handed to runEvalbuff as its repoPath.
const TARGET_DIR = path.join(BASE_DIR, 'target')

// One fixed identity, used for both author and committer, merged into the
// environment of the git commands this script runs.
const GIT_IDENTITY = { name: 'evalbuff-test', email: 'test@evalbuff.dev' }

const gitEnv = {
  GIT_AUTHOR_NAME: GIT_IDENTITY.name,
  GIT_AUTHOR_EMAIL: GIT_IDENTITY.email,
  GIT_COMMITTER_NAME: GIT_IDENTITY.name,
  GIT_COMMITTER_EMAIL: GIT_IDENTITY.email,
}
38+
39+
/**
 * Runs one git command through the shell and hands back its stdout.
 *
 * @param cmd - Everything that follows the `git` word, e.g. `'rev-parse HEAD'`.
 * @param cwd - Directory the command is executed in.
 * @returns The captured stdout with surrounding whitespace trimmed.
 * @throws Whatever `execSync` throws when the command fails.
 */
function git(cmd: string, cwd: string) {
  // The fixed test identity is layered over the inherited environment.
  const env = { ...process.env, ...gitEnv }
  const stdout = execSync(`git ${cmd}`, {
    cwd,
    env,
    encoding: 'utf-8',
    // stdin is closed; stdout and stderr are piped instead of inherited.
    stdio: ['ignore', 'pipe', 'pipe'],
  })
  return stdout.trim()
}
47+
48+
/**
 * Renders the sample project's `index.js`.
 *
 * @param subtractOperator - `'+'` yields the deliberately broken module
 *   (including its BUG comment); `'-'` yields the ground-truth fix.
 * @returns The full file contents, ending with a newline.
 */
function mathModuleSource(subtractOperator: '+' | '-'): string {
  const bugComment =
    subtractOperator === '+'
      ? '// BUG: subtract is wrong — it adds instead of subtracting\n'
      : ''

  return `// Simple math utility
export function add(a, b) {
  return a + b
}

export function multiply(a, b) {
  return a * b
}

${bugComment}export function subtract(a, b) {
  return a ${subtractOperator} b
}

export function divide(a, b) {
  if (b === 0) throw new Error('Division by zero')
  return a / b
}
`
}

// Contents of the sample project's `test.js`: a dependency-free test script
// that exits with status 1 when any assertion fails.
const TEST_JS_SOURCE = `import { add, subtract, multiply, divide } from './index.js'

let passed = 0
let failed = 0

function assert(name, actual, expected) {
  if (actual === expected) {
    console.log(\` ✓ \${name}\`)
    passed++
  } else {
    console.log(\` ✗ \${name}: expected \${expected}, got \${actual}\`)
    failed++
  }
}

console.log('Running tests...')
assert('add(2, 3)', add(2, 3), 5)
assert('multiply(3, 4)', multiply(3, 4), 12)
assert('subtract(10, 3)', subtract(10, 3), 7)
assert('divide(10, 2)', divide(10, 2), 5)

try {
  divide(1, 0)
  console.log(' ✗ divide by zero should throw')
  failed++
} catch (e) {
  console.log(' ✓ divide by zero throws')
  passed++
}

console.log(\`\\n\${passed} passed, \${failed} failed\`)
if (failed > 0) process.exit(1)
`

/**
 * Builds the throwaway sample repository used by the E2E run.
 *
 * Two commits are created in PROJECT_DIR: a parent commit whose `subtract`
 * adds instead of subtracting, and a fix commit that corrects it. The
 * repository is then cloned as a bare repo into BARE_REPO.
 *
 * @returns The parent commit SHA, the fix commit SHA, and the `index.js` diff
 *   between them (trimmed, because it comes from the `git` helper).
 * @throws If any git command fails.
 */
function setupProject() {
  console.log('\n=== Setting up test project ===')

  // Create project directory
  fs.mkdirSync(PROJECT_DIR, { recursive: true })
  git('init', PROJECT_DIR)

  const indexPath = path.join(PROJECT_DIR, 'index.js')

  // Initial commit: a simple Node.js project with a bug
  fs.writeFileSync(
    path.join(PROJECT_DIR, 'package.json'),
    JSON.stringify(
      {
        name: 'evalbuff-test-project',
        version: '1.0.0',
        type: 'module',
        scripts: {
          test: 'node test.js',
          start: 'node index.js',
        },
      },
      null,
      2,
    ),
  )
  fs.writeFileSync(indexPath, mathModuleSource('+'))
  fs.writeFileSync(path.join(PROJECT_DIR, 'test.js'), TEST_JS_SOURCE)

  git('add .', PROJECT_DIR)
  git('commit -m "Initial project with bug in subtract"', PROJECT_DIR)
  const parentSha = git('rev-parse HEAD', PROJECT_DIR)

  console.log(` Parent commit (with bug): ${parentSha.slice(0, 8)}`)

  // Now create the ground truth fix
  fs.writeFileSync(indexPath, mathModuleSource('-'))

  git('add .', PROJECT_DIR)
  git('commit -m "Fix subtract function"', PROJECT_DIR)
  const fixSha = git('rev-parse HEAD', PROJECT_DIR)

  console.log(` Fix commit (ground truth): ${fixSha.slice(0, 8)}`)

  // Get the diff for the ground truth
  const diff = git(`diff ${parentSha} ${fixSha} -- index.js`, PROJECT_DIR)

  // Bare clone of the project. Both paths are quoted so the command still
  // works when the temp directory path contains whitespace.
  execSync(`git clone --bare "${PROJECT_DIR}" "${BARE_REPO}"`, {
    stdio: 'ignore',
    env: { ...process.env, ...gitEnv },
  })
  console.log(` Bare repo created at: ${BARE_REPO}`)

  return { parentSha, fixSha, diff }
}
180+
181+
/**
 * Writes `eval.json` into BASE_DIR describing the single "fix-subtract-bug"
 * task and returns the path of the written file.
 *
 * @param parentSha - Commit that still contains the bug.
 * @param fixSha - Commit that holds the ground-truth fix.
 * @param diff - Diff of `index.js` between the two commits.
 * @returns Absolute path of the generated eval file.
 */
function createEvalFile(parentSha: string, fixSha: string, diff: string) {
  console.log('\n=== Creating eval file ===')

  const subtractTask: EvalDataV2['evalCommits'][number] = {
    id: 'fix-subtract-bug',
    sha: fixSha,
    parentSha,
    spec: 'Fix the subtract function which incorrectly adds instead of subtracting',
    prompt:
      'The subtract function in index.js has a bug — it adds the two numbers instead of subtracting them. Fix it. Then run the tests to make sure they pass.',
    supplementalFiles: ['test.js'],
    fileDiffs: [{ path: 'index.js', status: 'modified', diff }],
  }

  const evalData: EvalDataV2 = {
    // The repo URL points at the local bare clone.
    repoUrl: `file://${BARE_REPO}`,
    generationDate: new Date().toISOString(),
    evalCommits: [subtractTask],
  }

  const evalPath = path.join(BASE_DIR, 'eval.json')
  const serialized = JSON.stringify(evalData, null, 2)
  fs.writeFileSync(evalPath, serialized)
  console.log(` Eval file: ${evalPath}`)

  return evalPath
}
212+
213+
/**
 * Creates TARGET_DIR as a git repository containing one empty "init" commit.
 *
 * @returns TARGET_DIR, for convenience at the call site.
 */
function setupTargetRepo() {
  console.log('\n=== Setting up target repo (for docs output) ===')

  fs.mkdirSync(TARGET_DIR, { recursive: true })

  // Initialise the repo, then give it a first (empty) commit.
  const bootstrapCommands = ['init', 'commit --allow-empty -m "init"']
  for (const command of bootstrapCommands) {
    git(command, TARGET_DIR)
  }

  console.log(` Target repo: ${TARGET_DIR}`)
  return TARGET_DIR
}
222+
223+
/**
 * Probes the machine for reviewer CLIs and reports each finding on stdout.
 *
 * `claude` qualifies when `which claude` succeeds. `codex` qualifies only
 * when `which codex` succeeds and OPENAI_API_KEY is set in the environment.
 *
 * @returns The reviewers that qualified, in the order claude, codex.
 */
function detectAvailableReviewers(): ReviewerAgentType[] {
  // `which` exits non-zero for an unknown binary, which makes execSync throw.
  const isOnPath = (binary: string): boolean => {
    try {
      execSync(`which ${binary}`, { stdio: 'ignore' })
      return true
    } catch {
      return false
    }
  }

  const found: ReviewerAgentType[] = []

  if (isOnPath('claude')) {
    found.push('claude')
    console.log(' ✓ claude CLI found')
  } else {
    console.log(' ✗ claude CLI not found')
  }

  if (!isOnPath('codex')) {
    console.log(' ✗ codex CLI not found')
  } else if (process.env.OPENAI_API_KEY) {
    found.push('codex')
    console.log(' ✓ codex CLI found (OPENAI_API_KEY set)')
  } else {
    console.log(' ✗ codex CLI found but OPENAI_API_KEY not set')
  }

  return found
}
248+
249+
/**
 * Recursively collects the paths of all `*.md` entries below `dir`.
 *
 * Implemented with `fs` rather than a shell `find`, so it does not depend on
 * an external binary or on shell quoting of the directory path.
 */
function findMarkdownFiles(dir: string): string[] {
  const matches: string[] = []
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const entryPath = path.join(dir, entry.name)
    if (entry.isDirectory()) {
      matches.push(...findMarkdownFiles(entryPath))
    } else if (entry.name.endsWith('.md')) {
      matches.push(entryPath)
    }
  }
  return matches
}

/**
 * Drives the whole E2E run: detects agents, builds the fixture repos, calls
 * `runEvalbuff`, then prints what was produced in the target repo (log
 * entries, morning report, docs, state file).
 *
 * Exits the process with status 1 when no reviewer agent or no `claude`
 * coding agent is available. When `runEvalbuff` throws, the error is printed,
 * the verification output is still produced for inspection, and the process
 * exit code is set to 1 so the failure is visible to callers.
 */
async function main() {
  console.log('╔══════════════════════════════════════════╗')
  console.log('║ Evalbuff Real E2E Test ║')
  console.log('╚══════════════════════════════════════════╝')
  console.log(`\nBase dir: ${BASE_DIR}`)

  // Detect available agents
  console.log('\n=== Detecting available agents ===')
  const reviewers = detectAvailableReviewers()

  if (reviewers.length === 0) {
    console.error('\nNo reviewer agents available. Need at least one of: claude, codex')
    process.exit(1)
  }

  // Detect coding agent
  let agentCommand = ''
  try {
    execSync('which claude', { stdio: 'ignore' })
    agentCommand = 'claude --dangerously-skip-permissions -p'
    console.log(` Using coding agent: ${agentCommand}`)
  } catch {
    console.error('\nClaude CLI not found. Install with: npm install -g @anthropic-ai/claude-code')
    process.exit(1)
  }

  // Setup
  const { parentSha, fixSha, diff } = setupProject()
  const evalPath = createEvalFile(parentSha, fixSha, diff)
  const targetDir = setupTargetRepo()

  // Run evalbuff
  console.log('\n=== Running evalbuff ===')
  console.log(` Agent: ${agentCommand}`)
  console.log(` Reviewers: ${reviewers.join(', ')}`)
  console.log(` Task: fix-subtract-bug`)
  console.log('')

  const startTime = Date.now()
  let evalbuffFailed = false

  try {
    await runEvalbuff({
      repoPath: targetDir,
      agentCommand,
      evalDataPaths: [evalPath],
      maxIterations: 1,
      maxCostUsd: 10,
      scoreThreshold: 7.0,
      agentTimeoutMs: 5 * 60 * 1000, // 5 min for the coding agent
      reviewerAgents: reviewers,
    })
  } catch (error) {
    // Keep going so partial output can still be inspected, but remember the
    // failure so the process does not report success.
    evalbuffFailed = true
    console.error('\nEvalbuff failed:', error)
  }

  const durationMs = Date.now() - startTime

  // Verify results
  console.log('\n=== Verifying results ===')

  const logPath = path.join(targetDir, 'evalbuff-log.jsonl')
  if (fs.existsSync(logPath)) {
    // Blank lines are dropped before parsing so stray newlines in the JSONL
    // file cannot make JSON.parse throw.
    const logLines = fs
      .readFileSync(logPath, 'utf-8')
      .split('\n')
      .filter((line) => line.trim().length > 0)
    if (logLines.length > 0) {
      const entries = logLines.map((line) => JSON.parse(line))
      console.log(` Log entries: ${entries.length}`)
      for (const entry of entries) {
        console.log(` Task: ${entry.taskId}`)
        console.log(` Old score: ${entry.oldScore}`)
        console.log(` New score: ${entry.newScore ?? 'N/A'}`)
        console.log(` Doc edit: ${entry.docEdit ? entry.docEdit.path : 'none'}`)
        console.log(` Score comparison: ${entry.scoreComparison ?? 'N/A'}`)
        console.log(` Duration: ${(entry.durationMs / 1000).toFixed(1)}s`)
        console.log(` Error: ${entry.error ?? 'none'}`)
      }
    } else {
      console.log(' ✗ Log file is empty')
    }
  } else {
    console.log(' ✗ Log file not found')
  }

  // Check morning report
  const [firstReport] = fs
    .readdirSync(targetDir)
    .filter((f) => f.startsWith('evalbuff-report-'))
  if (firstReport !== undefined) {
    console.log(`\n ✓ Morning report: ${firstReport}`)
    const report = fs.readFileSync(path.join(targetDir, firstReport), 'utf-8')
    console.log('\n--- Morning Report ---')
    console.log(report)
    console.log('--- End Report ---')
  } else {
    console.log(' ✗ No morning report generated')
  }

  // Check docs
  const docsDir = path.join(targetDir, 'docs')
  if (fs.existsSync(docsDir)) {
    const docFiles = findMarkdownFiles(docsDir)
    if (docFiles.length > 0) {
      console.log(`\n ✓ Docs generated:`)
      for (const f of docFiles) {
        console.log(` ${f}`)
      }
    }
  }

  // Check state
  const statePath = path.join(targetDir, 'evalbuff-state.json')
  if (fs.existsSync(statePath)) {
    const state = JSON.parse(fs.readFileSync(statePath, 'utf-8'))
    // Missing fields are reported as zero instead of crashing the summary.
    const completedCount = state?.completedTaskIds?.length ?? 0
    const spentUsd = Number(state?.totalCostUsd ?? 0)
    console.log(`\n ✓ State: ${completedCount} completed, $${spentUsd.toFixed(2)} spent`)
  }

  console.log(`\n=== E2E test completed in ${(durationMs / 1000).toFixed(1)}s ===`)
  console.log(`Base dir (for inspection): ${BASE_DIR}`)

  // Cleanup prompt
  console.log(`\nTo clean up: rm -rf ${BASE_DIR}`)

  if (evalbuffFailed) {
    console.error('\nE2E test FAILED: runEvalbuff threw (see error above)')
    // Set the exit code rather than exiting, so pending output still flushes.
    process.exitCode = 1
  }
}
375+
376+
// Script entry point: a rejection that escapes main() is printed and turned
// into a non-zero exit status.
main().then(undefined, (reason) => {
  console.error('E2E test failed:', reason)
  process.exit(1)
})

0 commit comments

Comments
 (0)