|
| 1 | +/** |
| 2 | + * E2E test for evalbuff. |
| 3 | + * |
| 4 | + * This test runs the full evalbuff loop with a stand-in agent (`echo`) on a local |
| 5 | + * git repo with synthetic eval tasks. It verifies: |
| 6 | + * - The morning report is generated |
| 7 | + * - Log entries are written |
| 8 | + * - State file tracks completed tasks |
| 9 | + * - Doc edits are committed to the repo when they improve scores |
| 10 | + * |
| 11 | + * This test uses mock.module to replace LLM calls, the SDK client, and withTestRepo, |
| 12 | + * but runs the full orchestrator, CLI runner, and git operations for real. |
| 13 | + * |
| 14 | + * Run: bun test evals/evalbuff/__tests__/e2e.test.ts |
| 15 | + */ |
| 16 | +import { execSync } from 'child_process' |
| 17 | +import fs from 'fs' |
| 18 | +import os from 'os' |
| 19 | +import path from 'path' |
| 20 | + |
| 21 | +import { afterAll, beforeAll, describe, expect, it, mock } from 'bun:test' |
| 22 | + |
| 23 | +import type { JudgingResult } from '../judge' |
| 24 | +import type { DocSuggestion } from '../docs-optimizer' |
| 25 | +import type { EvalDataV2 } from '../types' |
| 26 | + |
| 27 | +// --- Mocks: LLM calls, SDK client, and withTestRepo --- |
| 28 | + |
| 29 | +let judgeCallCount = 0 /* number of mocked judge calls so far; selects the next canned score and is reset in beforeAll */ |
| 30 | + |
| 31 | +mock.module('../test-repo-utils', () => ({ |
| 32 | + withTestRepo: async (_config: any, fn: (cwd: string) => Promise<any>) => { |
| 33 | + // Stand-in for withTestRepo: ignores `_config`, gives `fn` a fresh throwaway local git repo, and returns fn's result |
| 34 | + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repo-')) |
| 35 | + try { |
| 36 | + // Git setup runs inside the try so a failed init/commit still removes the temp dir in `finally` |
| 37 | + execSync('git init && git add . && git commit --allow-empty -m "init"', { |
| 38 | + cwd: dir, stdio: 'ignore', |
| 39 | + env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, |
| 40 | + }) |
| 41 | + return await fn(dir) |
| 42 | + } finally { |
| 43 | + fs.rmSync(dir, { recursive: true, force: true }) |
| 44 | + } |
| 45 | + }, |
| 46 | +})) |
| 47 | + |
| 48 | +// Judge cycles through a fixed score list: low first (triggers doc edit), then higher (confirms improvement) |
| 49 | +mock.module('../judge', () => ({ |
| 50 | + judgeCommitResult: async () => { |
| 51 | + const cannedScores = [3.0, 6.0, 8.5, 5.0, 7.0, 9.0] |
| 52 | + const callNumber = ++judgeCallCount // 1-based number of this call; also advances the shared counter |
| 53 | + const score = cannedScores[(callNumber - 1) % cannedScores.length] |
| 54 | + return { |
| 55 | + analysis: `Mock analysis for call ${callNumber}`, |
| 56 | + strengths: ['Correctly identified the problem'], |
| 57 | + weaknesses: ['Missing error handling', 'No tests added'], |
| 58 | + completionScore: score, |
| 59 | + codeQualityScore: score, |
| 60 | + overallScore: score, |
| 61 | + } satisfies JudgingResult |
| 62 | + }, |
| 63 | +})) |
| 64 | + |
| 65 | +const actualDocsOptimizer = await import('../docs-optimizer') |
| 66 | +mock.module('../docs-optimizer', () => ({ |
| 67 | + ...actualDocsOptimizer, |
| 68 | + analyzeFailure: async () => { // only this export is replaced; it always returns the same canned suggestion |
| 69 | + return { |
| 70 | + reasoning: 'Agent consistently misses error handling patterns in async code', |
| 71 | + suggestedDocPath: 'patterns/async-error-handling.md', |
| 72 | + suggestedContent: '# Async Error Handling\n\nAll async functions should use try/catch blocks.\nPropagate errors with meaningful messages.\n\n## Examples\n\n```ts\nasync function fetchData() {\n try {\n const result = await api.get("/data")\n return result\n } catch (error) {\n throw new Error(`Failed to fetch data: ${error.message}`)\n }\n}\n```\n', |
| 73 | + } satisfies DocSuggestion |
| 74 | + }, |
| 75 | +})) |
| 76 | + |
| 77 | +mock.module('@codebuff/sdk', () => { |
| 78 | + // Inert stand-in for the SDK client: constructing it does nothing |
| 79 | + const CodebuffClient = class {} |
| 80 | + return { CodebuffClient } |
| 81 | +}) |
| 82 | + |
| 83 | +const { runEvalbuff } = await import('../run-evalbuff') |
| 84 | + |
| 85 | +// --- Test setup --- |
| 86 | + |
| 87 | +let repoDir: string /* temp git repo created in beforeAll, passed to runEvalbuff as repoPath, removed in afterAll */ |
| 88 | +let evalFilePath: string /* path of the JSON eval file written in beforeAll */ |
| 89 | + |
| 90 | +beforeAll(() => { /* one-time setup: temp target repo, eval JSON file, judge counter reset */ |
| 91 | + // Create the "target repo": a temp dir with an empty initial commit; the test later looks here for docs/, AGENTS.md, the report, log and state files |
| 92 | + repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-target-')) |
| 93 | + execSync('git init && git add . && git commit --allow-empty -m "init"', { |
| 94 | + cwd: repoDir, |
| 95 | + stdio: 'ignore', |
| 96 | + env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, |
| 97 | + }) |
| 98 | + |
| 99 | + // Build an EvalDataV2 fixture with 3 synthetic tasks; the shas look like placeholders — presumably never checked out because withTestRepo is mocked (TODO confirm) |
| 100 | + const evalData: EvalDataV2 = { |
| 101 | + repoUrl: 'https://github.com/test/repo', |
| 102 | + generationDate: '2026-03-25', |
| 103 | + evalCommits: [ |
| 104 | + { |
| 105 | + id: 'e2e-task-1', |
| 106 | + sha: 'aaa111', |
| 107 | + parentSha: 'aaa000', |
| 108 | + spec: 'Add error handling to fetchData', |
| 109 | + prompt: 'Add try/catch error handling to the fetchData function in src/api.ts', |
| 110 | + supplementalFiles: [], |
| 111 | + fileDiffs: [ |
| 112 | + { |
| 113 | + path: 'src/api.ts', |
| 114 | + status: 'modified', |
| 115 | + diff: '@@ -5,3 +5,7 @@\n-const data = await fetch(url)\n+try {\n+ const data = await fetch(url)\n+} catch (e) {\n+ throw new Error(`Fetch failed: ${e.message}`)\n+}', |
| 116 | + }, |
| 117 | + ], |
| 118 | + }, |
| 119 | + { |
| 120 | + id: 'e2e-task-2', |
| 121 | + sha: 'bbb222', |
| 122 | + parentSha: 'bbb000', |
| 123 | + spec: 'Add input validation', |
| 124 | + prompt: 'Add input validation to the createUser endpoint', |
| 125 | + supplementalFiles: [], |
| 126 | + fileDiffs: [ |
| 127 | + { |
| 128 | + path: 'src/routes/users.ts', |
| 129 | + status: 'modified', |
| 130 | + diff: '@@ -1 +1,5 @@\n+if (!name || !email) {\n+ throw new Error("name and email required")\n+}', |
| 131 | + }, |
| 132 | + ], |
| 133 | + }, |
| 134 | + { |
| 135 | + id: 'e2e-task-3', |
| 136 | + sha: 'ccc333', |
| 137 | + parentSha: 'ccc000', |
| 138 | + spec: 'Refactor logger', |
| 139 | + prompt: 'Refactor the logger to use structured JSON output', |
| 140 | + supplementalFiles: [], |
| 141 | + fileDiffs: [ |
| 142 | + { |
| 143 | + path: 'src/logger.ts', |
| 144 | + status: 'modified', |
| 145 | + diff: '@@ -1 +1,3 @@\n-console.log(msg)\n+const entry = { timestamp: Date.now(), message: msg }\n+process.stdout.write(JSON.stringify(entry) + "\\n")', |
| 146 | + }, |
| 147 | + ], |
| 148 | + }, |
| 149 | + ], |
| 150 | + } |
| 151 | + |
| 152 | + evalFilePath = path.join(repoDir, 'eval-e2e.json') /* the fixture lives inside the target repo directory */ |
| 153 | + fs.writeFileSync(evalFilePath, JSON.stringify(evalData)) |
| 154 | + |
| 155 | + judgeCallCount = 0 /* restart the mocked judge at the first canned score (3.0) */ |
| 156 | +}) |
| 157 | + |
| 158 | +afterAll(() => { |
| 159 | + if (repoDir) fs.rmSync(repoDir, { recursive: true, force: true }) // repoDir stays unset if beforeAll threw before creating it; rmSync rejects a non-path argument |
| 160 | +}) |
| 161 | + |
| 162 | +// --- E2E tests --- |
| 163 | + |
| 164 | +describe('evalbuff E2E', () => { |
| 165 | + it('runs full loop: agent, judge, doc edit, morning report', async () => { |
| 166 | + // Task IDs from the eval fixture, in the order they are expected to run |
| 167 | + const taskIdsInOrder = ['e2e-task-1', 'e2e-task-2', 'e2e-task-3'] |
| 168 | + // Resolve a path inside the target repo |
| 169 | + const inRepo = (...parts: string[]) => path.join(repoDir, ...parts) |
| 170 | + |
| 171 | + // `echo` stands in for the agent: it just prints the prompt and exits |
| 172 | + await runEvalbuff({ |
| 173 | + repoPath: repoDir, |
| 174 | + agentCommand: 'echo', |
| 175 | + evalDataPaths: [evalFilePath], |
| 176 | + maxIterations: 3, |
| 177 | + maxCostUsd: 50, |
| 178 | + scoreThreshold: 7.0, |
| 179 | + agentTimeoutMs: 10_000, |
| 180 | + }) |
| 181 | + |
| 182 | + // 1. Exactly one morning report, with the expected title and iteration count |
| 183 | + const reportNames = fs.readdirSync(repoDir).filter((name) => name.startsWith('evalbuff-report-')) |
| 184 | + expect(reportNames).toHaveLength(1) |
| 185 | + const reportText = fs.readFileSync(inRepo(reportNames[0]), 'utf-8') |
| 186 | + expect(reportText).toContain('# Evalbuff Morning Report') |
| 187 | + expect(reportText).toContain('Iterations | 3') |
| 188 | + |
| 189 | + // 2. The JSONL log exists and holds 3 lines |
| 190 | + const logFile = inRepo('evalbuff-log.jsonl') |
| 191 | + expect(fs.existsSync(logFile)).toBe(true) |
| 192 | + const rawLog = fs.readFileSync(logFile, 'utf-8') |
| 193 | + const logLines = rawLog.trim().split('\n') |
| 194 | + expect(logLines).toHaveLength(3) |
| 195 | + |
| 196 | + // 3. State file lists all 3 tasks as completed, in order |
| 197 | + const { completedTaskIds } = JSON.parse(fs.readFileSync(inRepo('evalbuff-state.json'), 'utf-8')) |
| 198 | + expect(completedTaskIds).toEqual(taskIdsInOrder) |
| 199 | + |
| 200 | + // 4. docs/ directory was created (first judge score is 3.0, below the 7.0 threshold) |
| 201 | + expect(fs.existsSync(inRepo('docs'))).toBe(true) |
| 202 | + |
| 203 | + // 5. AGENTS.md exists and mentions the suggested doc file |
| 204 | + const agentsFile = inRepo('AGENTS.md') |
| 205 | + expect(fs.existsSync(agentsFile)).toBe(true) |
| 206 | + expect(fs.readFileSync(agentsFile, 'utf-8')).toContain('async-error-handling.md') |
| 207 | + |
| 208 | + // 6. Target repo history contains a commit whose subject includes "evalbuff:" |
| 209 | + const history = execSync('git log --oneline', { cwd: repoDir, encoding: 'utf-8' }) |
| 210 | + expect(history).toContain('evalbuff:') |
| 211 | + |
| 212 | + // 7. Log entries carry the task IDs in run order |
| 213 | + const loggedTaskIds = logLines.map((line) => JSON.parse(line).taskId) |
| 214 | + expect(loggedTaskIds).toEqual(taskIdsInOrder) |
| 215 | + }) |
| 216 | +}) |
0 commit comments