Skip to content

Commit 3100dda

Browse files
jahoomaclaude
and committed
evalbuff: fix reviewer agent invocation and cost tracking
- Write reviewer prompt to file instead of CLI args (avoids length limits)
- Use rsync + node_modules symlink instead of cp -r (1.7GB → fast)
- Don't pass eval env to reviewers (test API keys break real agents)
- Strip API key env vars from coding agent env too
- Remove CodebuffClient dependency from orchestrator
- Fix cost estimate: was $1/sec, now $0.01/sec
- Always log stderr/stdout on reviewer failure
- Remove --output-format/--json flags from reviewer commands

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b238947 commit 3100dda

File tree

2 files changed

+49
-18
lines changed

2 files changed

+49
-18
lines changed

evals/evalbuff/judge.ts

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ const REVIEWER_CONFIGS: Record<ReviewerAgentType, ReviewerConfig> = {
5858
'claude',
5959
'-p',
6060
'__PROMPT__',
61-
'--output-format',
62-
'stream-json',
6361
'--dangerously-skip-permissions',
6462
],
6563
timeoutMs: 30 * 60 * 1000, // 30 min — needs time for E2E testing
@@ -70,7 +68,6 @@ const REVIEWER_CONFIGS: Record<ReviewerAgentType, ReviewerConfig> = {
7068
'codex',
7169
'exec',
7270
'--full-auto',
73-
'--json',
7471
'-m',
7572
'gpt-5.1-codex',
7673
'__PROMPT__',
@@ -180,9 +177,14 @@ All scores are 0-10. The e2eScore specifically measures how well the change work
180177
IMPORTANT: You MUST write the result file. This is the only way your review gets recorded. Do it as your very last action.`
181178
}
182179

180+
const PROMPT_FILE_NAME = 'EVALBUFF_REVIEW_PROMPT.md'
181+
182+
const BOOTSTRAP_PROMPT = `Read the file ${PROMPT_FILE_NAME} in the current directory and follow all instructions in it exactly. The file contains a code review task. After your review and testing, you MUST write your judgment to ${RESULT_FILE_NAME} as specified in the prompt file.`
183+
183184
/**
184185
* Run a single reviewer agent in the given repo directory.
185-
* The agent writes its judgment to a JSON file which we parse.
186+
* Writes the full prompt to a file in the repo, then gives the agent
187+
* a short bootstrap prompt to read it (avoids CLI arg length limits).
186188
*/
187189
async function runReviewerAgent(
188190
agentType: ReviewerAgentType,
@@ -191,9 +193,13 @@ async function runReviewerAgent(
191193
env?: Record<string, string>,
192194
): Promise<JudgingResult | null> {
193195
const config = REVIEWER_CONFIGS[agentType]
196+
197+
// Write the full prompt to a file in the repo
198+
fs.writeFileSync(path.join(cwd, PROMPT_FILE_NAME), prompt)
199+
194200
const args = config.command
195201
.slice(1)
196-
.map((a) => (a === '__PROMPT__' ? prompt : a))
202+
.map((a) => (a === '__PROMPT__' ? BOOTSTRAP_PROMPT : a))
197203

198204
const cmd = config.command[0]
199205

@@ -240,6 +246,14 @@ async function runReviewerAgent(
240246
console.log(
241247
`[Reviewer:${agentType}] Exited with code ${code}`,
242248
)
249+
if (code !== 0) {
250+
console.warn(
251+
`[Reviewer:${agentType}] stderr (last 1000 chars): ${stderr.slice(-1000)}`,
252+
)
253+
console.warn(
254+
`[Reviewer:${agentType}] stdout (last 500 chars): ${stdout.slice(-500)}`,
255+
)
256+
}
243257

244258
// Try to read the result file the agent wrote
245259
const resultPath = path.join(cwd, RESULT_FILE_NAME)
@@ -408,8 +422,20 @@ export async function judgeCommitResult(
408422
// Each reviewer gets its own copy of the repo so they don't interfere
409423
const reviewDir = `${repoDir}-review-${agentType}`
410424
try {
411-
execSync(`cp -r ${repoDir} ${reviewDir}`, { stdio: 'ignore' })
412-
return await runReviewerAgent(agentType, prompt, reviewDir, env)
425+
// Fast copy: use rsync to exclude heavy dirs, then symlink them
426+
const nodeModulesPath = path.join(repoDir, 'node_modules')
427+
const hasNodeModules = fs.existsSync(nodeModulesPath)
428+
if (hasNodeModules) {
429+
execSync(
430+
`rsync -a --exclude node_modules "${repoDir}/" "${reviewDir}/"`,
431+
{ stdio: 'ignore' },
432+
)
433+
fs.symlinkSync(nodeModulesPath, path.join(reviewDir, 'node_modules'))
434+
} else {
435+
execSync(`cp -r "${repoDir}" "${reviewDir}"`, { stdio: 'ignore' })
436+
}
437+
// Don't pass eval env to reviewers — they need real API keys, not test ones
438+
return await runReviewerAgent(agentType, prompt, reviewDir)
413439
} finally {
414440
try {
415441
fs.rmSync(reviewDir, { recursive: true, force: true })

evals/evalbuff/run-evalbuff.ts

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ import { execSync } from 'child_process'
22
import fs from 'fs'
33
import path from 'path'
44

5-
import { CodebuffClient } from '@codebuff/sdk'
6-
75
import { runCliAgent } from './cli-runner'
86
import {
97
getCriteriaForLevel,
@@ -133,15 +131,25 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
133131

134132
const statePath = path.join(repoPath, 'evalbuff-state.json')
135133
const logPath = path.join(repoPath, 'evalbuff-log.jsonl')
134+
135+
// Strip API key env vars — eval data provides test keys for init commands
136+
// but agents need their real API keys to function
137+
const API_KEY_PATTERN = /(_KEY|_SECRET|_TOKEN|_API_KEY)$/i
138+
const stripApiKeys = (env?: Record<string, string>) => {
139+
if (!env) return undefined
140+
return Object.fromEntries(
141+
Object.entries(env).filter(([k]) => !API_KEY_PATTERN.test(k)),
142+
)
143+
}
144+
const safeEnv = (evalData: { env?: Record<string, string> }) =>
145+
stripApiKeys(evalData.env)
136146
const defaultCriteriaPath =
137147
criteriaPath || path.join(repoPath, 'evalbuff-criteria.json')
138148

139149
const state = loadState(statePath)
140150
let criteria = loadCriteria(defaultCriteriaPath)
141151
const tasks = loadEvalTasks(evalDataPaths)
142152

143-
// CodebuffClient is only used for doc writer (analyzeFailure), not for judging
144-
const client = new CodebuffClient({})
145153

146154
console.log(`Evalbuff starting:`)
147155
console.log(` Repo: ${repoPath}`)
@@ -212,11 +220,11 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
212220
prompt: task.prompt,
213221
cwd: repoDir,
214222
timeoutMs: agentTimeoutMs,
215-
env: evalData.env,
223+
env: safeEnv(evalData),
216224
})
217225

218226
const contextFiles = getContextFiles(repoDir, task)
219-
logEntry.costUsd += result.durationMs * 0.001
227+
logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate
220228

221229
// Judge the result — reviewer agents run IN the repo
222230
// so they can build, test, start the app, use browser tools, etc.
@@ -229,7 +237,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
229237
error: result.exitCode !== 0 ? result.stderr : undefined,
230238
criteria,
231239
reviewerAgents,
232-
env: evalData.env,
233240
})
234241

235242
return judging
@@ -250,7 +257,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
250257
const currentDocs = readCurrentDocs(repoPath)
251258

252259
const docSuggestion = await analyzeFailure({
253-
client,
254260
judgeResult: oldJudging,
255261
taskPrompt: task.prompt,
256262
agentDiff: '', // agent diff not preserved after withTestRepo cleanup
@@ -290,11 +296,11 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
290296
prompt: task.prompt,
291297
cwd: freshRepoDir,
292298
timeoutMs: agentTimeoutMs,
293-
env: evalData.env,
299+
env: safeEnv(evalData),
294300
})
295301

296302
const contextFiles = getContextFiles(freshRepoDir, task)
297-
logEntry.costUsd += result.durationMs * 0.001
303+
logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate
298304

299305
console.log(`Re-judging with reviewer agents...`)
300306
return await judgeCommitResult({
@@ -305,7 +311,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
305311
error: result.exitCode !== 0 ? result.stderr : undefined,
306312
criteria,
307313
reviewerAgents,
308-
env: evalData.env,
309314
})
310315
},
311316
)

0 commit comments

Comments (0)