|
| 1 | +import { execSync } from 'child_process' |
| 2 | +import fs from 'fs' |
| 3 | +import os from 'os' |
| 4 | +import path from 'path' |
| 5 | + |
| 6 | +import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test' |
| 7 | + |
| 8 | +import type { JudgingResult } from '../judge' |
| 9 | +import type { DocSuggestion } from '../docs-optimizer' |
| 10 | +import type { EvalDataV2 } from '../types' |
| 11 | + |
| 12 | +// --- Mocks --- |
| 13 | + |
// Shared mutable state read and written by the module mocks below.
// Every field is reset in beforeEach so tests do not observe each other's values.

// Number of times the mocked judge has been invoked; also indexes into judgeScores.
let judgeCallCount = 0
// Scores the mocked judge hands out, one per call, in order.
let judgeScores: Array<number> = []
// Value the mocked analyzeFailure resolves with (null unless a test sets a suggestion).
let analyzeFailureResult: null | DocSuggestion = null
// Number of times the mocked CLI runner has been invoked.
let cliRunnerCallCount = 0
| 19 | + |
// Mock withTestRepo to use a local temp dir instead of cloning
mock.module('../test-repo-utils', () => ({
  /**
   * Test double for withTestRepo: creates a throwaway git repository in the
   * OS temp directory, runs `fn` with that directory as its working dir, and
   * removes the directory afterwards.
   *
   * @param _config Ignored; accepted only to match the call shape used by callers.
   * @param fn Callback that receives the temp repo path.
   * @returns Whatever `fn` resolves with.
   */
  withTestRepo: async <T>(
    _config: unknown,
    fn: (cwd: string) => Promise<T>,
  ): Promise<T> => {
    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-mock-repo-'))
    try {
      // Repo setup lives inside the try so a failing git command still
      // triggers cleanup of the temp dir instead of leaking it.
      execSync('git init && git add . && git commit --allow-empty -m "init"', {
        cwd: dir,
        stdio: 'ignore',
        // `git commit` aborts when no identity is configured (common on CI
        // machines), so supply one explicitly for the initial commit.
        env: {
          ...process.env,
          GIT_AUTHOR_NAME: 'evalbuff-test',
          GIT_AUTHOR_EMAIL: 'evalbuff-test@example.com',
          GIT_COMMITTER_NAME: 'evalbuff-test',
          GIT_COMMITTER_EMAIL: 'evalbuff-test@example.com',
        },
      })
      return await fn(dir)
    } finally {
      fs.rmSync(dir, { recursive: true, force: true })
    }
  },
}))
| 35 | + |
// Replace the CLI runner with a stub that records each invocation and
// resolves with a fixed, successful-looking result.
mock.module('../cli-runner', () => {
  const runCliAgent = async () => {
    cliRunnerCallCount += 1
    const fakeResult = {
      diff: 'mock diff content',
      durationMs: 1000,
      exitCode: 0,
      stdout: 'mock stdout',
      stderr: '',
    }
    return fakeResult
  }

  return { runCliAgent }
})
| 49 | + |
// Replace the judge with a stub whose scores are driven by `judgeScores`.
mock.module('../judge', () => {
  // Score used once the configured `judgeScores` list has been exhausted.
  const fallbackScore = 5.0

  /**
   * Returns the next configured score (by call order) for all three score
   * fields, alongside fixed analysis text.
   */
  async function judgeCommitResult() {
    const callIndex = judgeCallCount
    judgeCallCount += 1
    const score = judgeScores[callIndex] ?? fallbackScore

    const verdict = {
      analysis: 'Mock analysis',
      strengths: ['Good'],
      weaknesses: ['Could improve'],
      completionScore: score,
      codeQualityScore: score,
      overallScore: score,
    } satisfies JudgingResult
    return verdict
  }

  return { judgeCommitResult }
})
| 65 | + |
// Partially mock docs-optimizer: every real export is kept, and only
// analyzeFailure is swapped for a stub that resolves with whatever the
// current test stored in `analyzeFailureResult`.
const actualDocsOptimizer = await import('../docs-optimizer')
mock.module('../docs-optimizer', () => {
  const analyzeFailure = async () => analyzeFailureResult
  return { ...actualDocsOptimizer, analyzeFailure }
})
| 72 | + |
// Replace the SDK client with a class whose run() resolves with an empty
// text output.
mock.module('@codebuff/sdk', () => {
  class CodebuffClient {
    async run() {
      const output = { type: 'text', value: '' }
      return { output }
    }
  }

  return { CodebuffClient }
})
| 82 | + |
| 83 | +// Import after mocks are set up |
| 84 | +const { runEvalbuff } = await import('../run-evalbuff') |
| 85 | + |
| 86 | +// --- Test fixtures --- |
| 87 | + |
// Per-test scratch repo directory and the eval file written inside it;
// both are assigned in beforeEach.
let repoDir: string
let evalFilePath: string

/**
 * Writes an eval data file named `eval-test.json` into `repoDir`.
 *
 * @param taskCount Number of synthetic eval commits to generate; they are
 *   numbered from 1 (`task-1`, `task-2`, ...).
 * @returns Absolute path of the JSON file that was written.
 */
function createEvalFile(taskCount: number): string {
  // Builds the synthetic commit for the given 1-based task number.
  const buildCommit = (n: number): EvalDataV2['evalCommits'][number] => ({
    id: `task-${n}`,
    sha: `sha-${n}`,
    parentSha: `parent-${n}`,
    spec: `Test task ${n}`,
    prompt: `Do task ${n}`,
    supplementalFiles: [],
    fileDiffs: [
      {
        path: `src/file${n}.ts`,
        status: 'modified' as const,
        diff: `@@ -1 +1 @@\n-old\n+new`,
      },
    ],
  })

  const payload: EvalDataV2 = {
    repoUrl: 'https://github.com/test/repo',
    generationDate: '2026-03-25',
    evalCommits: Array.from({ length: taskCount }, (_, index) =>
      buildCommit(index + 1),
    ),
  }

  const target = path.join(repoDir, 'eval-test.json')
  const serialized = JSON.stringify(payload)
  fs.writeFileSync(target, serialized)
  return target
}
| 116 | + |
// Per-test setup: create a fresh git repo in the OS temp dir, write a
// five-task eval file into it, and reset all mock bookkeeping.
beforeEach(() => {
  repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-integ-'))
  execSync('git init && git add . && git commit --allow-empty -m "init"', {
    cwd: repoDir,
    stdio: 'ignore',
    // `git commit` aborts when no identity is configured (common on CI
    // machines), so supply one explicitly for the initial commit.
    env: {
      ...process.env,
      GIT_AUTHOR_NAME: 'evalbuff-test',
      GIT_AUTHOR_EMAIL: 'evalbuff-test@example.com',
      GIT_COMMITTER_NAME: 'evalbuff-test',
      GIT_COMMITTER_EMAIL: 'evalbuff-test@example.com',
    },
  })
  evalFilePath = createEvalFile(5)

  // Reset mock state
  judgeCallCount = 0
  judgeScores = []
  analyzeFailureResult = null
  cliRunnerCallCount = 0
})
| 131 | + |
// Per-test teardown: delete the scratch repo created in beforeEach.
afterEach(() => {
  const removal = { recursive: true, force: true }
  fs.rmSync(repoDir, removal)
})
| 135 | + |
| 136 | +// --- Tests --- |
| 137 | + |
| 138 | +describe('runEvalbuff integration', () => { |
| 139 | + it('completes one full iteration: runs agent, judges, and logs', async () => { |
| 140 | + judgeScores = [8.0] // Above threshold, no doc edit attempted |
| 141 | + |
| 142 | + await runEvalbuff({ |
| 143 | + repoPath: repoDir, |
| 144 | + agentCommand: 'echo', |
| 145 | + evalDataPaths: [evalFilePath], |
| 146 | + maxIterations: 1, |
| 147 | + maxCostUsd: 100, |
| 148 | + scoreThreshold: 7.0, |
| 149 | + agentTimeoutMs: 10_000, |
| 150 | + }) |
| 151 | + |
| 152 | + // Verify log was written |
| 153 | + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') |
| 154 | + expect(fs.existsSync(logPath)).toBe(true) |
| 155 | + const logLines = fs |
| 156 | + .readFileSync(logPath, 'utf-8') |
| 157 | + .trim() |
| 158 | + .split('\n') |
| 159 | + expect(logLines).toHaveLength(1) |
| 160 | + |
| 161 | + const entry = JSON.parse(logLines[0]) |
| 162 | + expect(entry.taskId).toBe('task-1') |
| 163 | + expect(entry.oldScore).toBe(8.0) |
| 164 | + expect(entry.docEdit).toBeNull() |
| 165 | + |
| 166 | + // Verify state was saved |
| 167 | + const statePath = path.join(repoDir, 'evalbuff-state.json') |
| 168 | + expect(fs.existsSync(statePath)).toBe(true) |
| 169 | + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) |
| 170 | + expect(state.completedTaskIds).toContain('task-1') |
| 171 | + |
| 172 | + // Verify morning report was generated |
| 173 | + const reportFiles = fs |
| 174 | + .readdirSync(repoDir) |
| 175 | + .filter((f) => f.startsWith('evalbuff-report-')) |
| 176 | + expect(reportFiles.length).toBeGreaterThan(0) |
| 177 | + }) |
| 178 | + |
| 179 | + it('attempts doc edit when score is below threshold', async () => { |
| 180 | + // First judge call returns low score, second (after doc edit) returns higher |
| 181 | + judgeScores = [4.0, 6.0] |
| 182 | + analyzeFailureResult = { |
| 183 | + reasoning: 'Agent missed error handling patterns', |
| 184 | + suggestedDocPath: 'patterns/errors.md', |
| 185 | + suggestedContent: '# Error Handling\n\nAlways use try/catch.', |
| 186 | + } |
| 187 | + |
| 188 | + await runEvalbuff({ |
| 189 | + repoPath: repoDir, |
| 190 | + agentCommand: 'echo', |
| 191 | + evalDataPaths: [evalFilePath], |
| 192 | + maxIterations: 1, |
| 193 | + maxCostUsd: 100, |
| 194 | + scoreThreshold: 7.0, |
| 195 | + agentTimeoutMs: 10_000, |
| 196 | + }) |
| 197 | + |
| 198 | + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') |
| 199 | + const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim()) |
| 200 | + expect(entry.oldScore).toBe(4.0) |
| 201 | + expect(entry.newScore).toBe(6.0) |
| 202 | + expect(entry.scoreComparison).toBe('improved') |
| 203 | + expect(entry.docEdit).not.toBeNull() |
| 204 | + expect(entry.docEdit.path).toBe('patterns/errors.md') |
| 205 | + |
| 206 | + // Doc should have been applied to the real repo |
| 207 | + const docPath = path.join(repoDir, 'docs', 'patterns', 'errors.md') |
| 208 | + expect(fs.existsSync(docPath)).toBe(true) |
| 209 | + expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling') |
| 210 | + }) |
| 211 | + |
| 212 | + it('stops at maxIterations', async () => { |
| 213 | + judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] |
| 214 | + |
| 215 | + await runEvalbuff({ |
| 216 | + repoPath: repoDir, |
| 217 | + agentCommand: 'echo', |
| 218 | + evalDataPaths: [evalFilePath], // 5 tasks available |
| 219 | + maxIterations: 2, |
| 220 | + maxCostUsd: 100, |
| 221 | + scoreThreshold: 7.0, |
| 222 | + agentTimeoutMs: 10_000, |
| 223 | + }) |
| 224 | + |
| 225 | + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') |
| 226 | + const logLines = fs |
| 227 | + .readFileSync(logPath, 'utf-8') |
| 228 | + .trim() |
| 229 | + .split('\n') |
| 230 | + expect(logLines).toHaveLength(2) |
| 231 | + |
| 232 | + const state = JSON.parse( |
| 233 | + fs.readFileSync(path.join(repoDir, 'evalbuff-state.json'), 'utf-8'), |
| 234 | + ) |
| 235 | + expect(state.completedTaskIds).toHaveLength(2) |
| 236 | + }) |
| 237 | + |
| 238 | + it('stops when cost exceeds maxCostUsd', async () => { |
| 239 | + judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] |
| 240 | + |
| 241 | + // First run — complete 1 task, which will accumulate some cost |
| 242 | + await runEvalbuff({ |
| 243 | + repoPath: repoDir, |
| 244 | + agentCommand: 'echo', |
| 245 | + evalDataPaths: [evalFilePath], |
| 246 | + maxIterations: 1, |
| 247 | + maxCostUsd: 100, |
| 248 | + scoreThreshold: 7.0, |
| 249 | + agentTimeoutMs: 10_000, |
| 250 | + }) |
| 251 | + |
| 252 | + // Manually set cost in state to be at the limit |
| 253 | + const statePath = path.join(repoDir, 'evalbuff-state.json') |
| 254 | + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) |
| 255 | + state.totalCostUsd = 100.0 |
| 256 | + fs.writeFileSync(statePath, JSON.stringify(state)) |
| 257 | + |
| 258 | + // Second run — should stop immediately due to cost (>= maxCostUsd) |
| 259 | + await runEvalbuff({ |
| 260 | + repoPath: repoDir, |
| 261 | + agentCommand: 'echo', |
| 262 | + evalDataPaths: [evalFilePath], |
| 263 | + maxIterations: 50, |
| 264 | + maxCostUsd: 100, |
| 265 | + scoreThreshold: 7.0, |
| 266 | + agentTimeoutMs: 10_000, |
| 267 | + }) |
| 268 | + |
| 269 | + // Should still only have 1 completed task (cost check prevents new tasks) |
| 270 | + const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8')) |
| 271 | + expect(finalState.completedTaskIds).toHaveLength(1) |
| 272 | + }) |
| 273 | + |
| 274 | + it('resumes from state file and skips completed tasks', async () => { |
| 275 | + judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] |
| 276 | + |
| 277 | + // Pre-populate state with 2 completed tasks |
| 278 | + const statePath = path.join(repoDir, 'evalbuff-state.json') |
| 279 | + fs.writeFileSync( |
| 280 | + statePath, |
| 281 | + JSON.stringify({ |
| 282 | + completedTaskIds: ['task-1', 'task-2'], |
| 283 | + totalCostUsd: 5.0, |
| 284 | + recentScores: [7.0, 8.0], |
| 285 | + }), |
| 286 | + ) |
| 287 | + |
| 288 | + await runEvalbuff({ |
| 289 | + repoPath: repoDir, |
| 290 | + agentCommand: 'echo', |
| 291 | + evalDataPaths: [evalFilePath], // 5 tasks |
| 292 | + maxIterations: 50, |
| 293 | + maxCostUsd: 100, |
| 294 | + scoreThreshold: 7.0, |
| 295 | + agentTimeoutMs: 10_000, |
| 296 | + }) |
| 297 | + |
| 298 | + // Should have processed tasks 3-5 (skipped 1 and 2) |
| 299 | + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') |
| 300 | + const logLines = fs |
| 301 | + .readFileSync(logPath, 'utf-8') |
| 302 | + .trim() |
| 303 | + .split('\n') |
| 304 | + expect(logLines).toHaveLength(3) |
| 305 | + |
| 306 | + const taskIds = logLines.map((l) => JSON.parse(l).taskId) |
| 307 | + expect(taskIds).toEqual(['task-3', 'task-4', 'task-5']) |
| 308 | + |
| 309 | + const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8')) |
| 310 | + expect(finalState.completedTaskIds).toHaveLength(5) |
| 311 | + }) |
| 312 | + |
| 313 | + it('reverts doc edit when score does not improve', async () => { |
| 314 | + // First judge: low score, second judge: even lower (doc didn't help) |
| 315 | + judgeScores = [4.0, 3.0] |
| 316 | + analyzeFailureResult = { |
| 317 | + reasoning: 'Tried to help', |
| 318 | + suggestedDocPath: 'bad-doc.md', |
| 319 | + suggestedContent: '# Bad Doc\n\nThis will not help.', |
| 320 | + } |
| 321 | + |
| 322 | + await runEvalbuff({ |
| 323 | + repoPath: repoDir, |
| 324 | + agentCommand: 'echo', |
| 325 | + evalDataPaths: [evalFilePath], |
| 326 | + maxIterations: 1, |
| 327 | + maxCostUsd: 100, |
| 328 | + scoreThreshold: 7.0, |
| 329 | + agentTimeoutMs: 10_000, |
| 330 | + }) |
| 331 | + |
| 332 | + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') |
| 333 | + const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim()) |
| 334 | + expect(entry.scoreComparison).toBe('worse') |
| 335 | + |
| 336 | + // Doc should NOT exist in the real repo |
| 337 | + const docPath = path.join(repoDir, 'docs', 'bad-doc.md') |
| 338 | + expect(fs.existsSync(docPath)).toBe(false) |
| 339 | + }) |
| 340 | +}) |
0 commit comments