Skip to content

Commit 3100dda

Browse files
jahoomaclaude
and committed
evalbuff: fix reviewer agent invocation and cost tracking
- Write reviewer prompt to file instead of CLI args (avoids length limits)
- Use rsync + node_modules symlink instead of cp -r (1.7GB → fast)
- Don't pass eval env to reviewers (test API keys break real agents)
- Strip API key env vars from coding agent env too
- Remove CodebuffClient dependency from orchestrator
- Fix cost estimate: was $1/sec, now $0.01/sec
- Always log stderr/stdout on reviewer failure
- Remove --output-format/--json flags from reviewer commands

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b238947 commit 3100dda

File tree

2 files changed

+49
-18
lines changed

2 files changed

+49
-18
lines changed

evals/evalbuff/judge.ts

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ const REVIEWER_CONFIGS: Record<ReviewerAgentType, ReviewerConfig> = {
5858
'claude',
5959
'-p',
6060
'__PROMPT__',
61-
'--output-format',
62-
'stream-json',
6361
'--dangerously-skip-permissions',
6462
],
6563
timeoutMs: 30 * 60 * 1000, // 30 min — needs time for E2E testing
@@ -70,7 +68,6 @@ const REVIEWER_CONFIGS: Record<ReviewerAgentType, ReviewerConfig> = {
7068
'codex',
7169
'exec',
7270
'--full-auto',
73-
'--json',
7471
'-m',
7572
'gpt-5.1-codex',
7673
'__PROMPT__',
@@ -180,9 +177,14 @@ All scores are 0-10. The e2eScore specifically measures how well the change work
180177
IMPORTANT: You MUST write the result file. This is the only way your review gets recorded. Do it as your very last action.`
181178
}
182179

180+
const PROMPT_FILE_NAME = 'EVALBUFF_REVIEW_PROMPT.md'
181+
182+
const BOOTSTRAP_PROMPT = `Read the file ${PROMPT_FILE_NAME} in the current directory and follow all instructions in it exactly. The file contains a code review task. After your review and testing, you MUST write your judgment to ${RESULT_FILE_NAME} as specified in the prompt file.`
183+
183184
/**
184185
* Run a single reviewer agent in the given repo directory.
185-
* The agent writes its judgment to a JSON file which we parse.
186+
* Writes the full prompt to a file in the repo, then gives the agent
187+
* a short bootstrap prompt to read it (avoids CLI arg length limits).
186188
*/
187189
async function runReviewerAgent(
188190
agentType: ReviewerAgentType,
@@ -191,9 +193,13 @@ async function runReviewerAgent(
191193
env?: Record<string, string>,
192194
): Promise<JudgingResult | null> {
193195
const config = REVIEWER_CONFIGS[agentType]
196+
197+
// Write the full prompt to a file in the repo
198+
fs.writeFileSync(path.join(cwd, PROMPT_FILE_NAME), prompt)
199+
194200
const args = config.command
195201
.slice(1)
196-
.map((a) => (a === '__PROMPT__' ? prompt : a))
202+
.map((a) => (a === '__PROMPT__' ? BOOTSTRAP_PROMPT : a))
197203

198204
const cmd = config.command[0]
199205

@@ -240,6 +246,14 @@ async function runReviewerAgent(
240246
console.log(
241247
`[Reviewer:${agentType}] Exited with code ${code}`,
242248
)
249+
if (code !== 0) {
250+
console.warn(
251+
`[Reviewer:${agentType}] stderr (last 1000 chars): ${stderr.slice(-1000)}`,
252+
)
253+
console.warn(
254+
`[Reviewer:${agentType}] stdout (last 500 chars): ${stdout.slice(-500)}`,
255+
)
256+
}
243257

244258
// Try to read the result file the agent wrote
245259
const resultPath = path.join(cwd, RESULT_FILE_NAME)
@@ -408,8 +422,20 @@ export async function judgeCommitResult(
408422
// Each reviewer gets its own copy of the repo so they don't interfere
409423
const reviewDir = `${repoDir}-review-${agentType}`
410424
try {
411-
execSync(`cp -r ${repoDir} ${reviewDir}`, { stdio: 'ignore' })
412-
return await runReviewerAgent(agentType, prompt, reviewDir, env)
425+
// Fast copy: use rsync to exclude heavy dirs, then symlink them
426+
const nodeModulesPath = path.join(repoDir, 'node_modules')
427+
const hasNodeModules = fs.existsSync(nodeModulesPath)
428+
if (hasNodeModules) {
429+
execSync(
430+
`rsync -a --exclude node_modules "${repoDir}/" "${reviewDir}/"`,
431+
{ stdio: 'ignore' },
432+
)
433+
fs.symlinkSync(nodeModulesPath, path.join(reviewDir, 'node_modules'))
434+
} else {
435+
execSync(`cp -r "${repoDir}" "${reviewDir}"`, { stdio: 'ignore' })
436+
}
437+
// Don't pass eval env to reviewers — they need real API keys, not test ones
438+
return await runReviewerAgent(agentType, prompt, reviewDir)
413439
} finally {
414440
try {
415441
fs.rmSync(reviewDir, { recursive: true, force: true })

evals/evalbuff/run-evalbuff.ts

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ import { execSync } from 'child_process'
22
import fs from 'fs'
33
import path from 'path'
44

5-
import { CodebuffClient } from '@codebuff/sdk'
6-
75
import { runCliAgent } from './cli-runner'
86
import {
97
getCriteriaForLevel,
@@ -133,15 +131,25 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
133131

134132
const statePath = path.join(repoPath, 'evalbuff-state.json')
135133
const logPath = path.join(repoPath, 'evalbuff-log.jsonl')
134+
135+
// Strip API key env vars — eval data provides test keys for init commands
136+
// but agents need their real API keys to function
137+
const API_KEY_PATTERN = /(_KEY|_SECRET|_TOKEN|_API_KEY)$/i
138+
const stripApiKeys = (env?: Record<string, string>) => {
139+
if (!env) return undefined
140+
return Object.fromEntries(
141+
Object.entries(env).filter(([k]) => !API_KEY_PATTERN.test(k)),
142+
)
143+
}
144+
const safeEnv = (evalData: { env?: Record<string, string> }) =>
145+
stripApiKeys(evalData.env)
136146
const defaultCriteriaPath =
137147
criteriaPath || path.join(repoPath, 'evalbuff-criteria.json')
138148

139149
const state = loadState(statePath)
140150
let criteria = loadCriteria(defaultCriteriaPath)
141151
const tasks = loadEvalTasks(evalDataPaths)
142152

143-
// CodebuffClient is only used for doc writer (analyzeFailure), not for judging
144-
const client = new CodebuffClient({})
145153

146154
console.log(`Evalbuff starting:`)
147155
console.log(` Repo: ${repoPath}`)
@@ -212,11 +220,11 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
212220
prompt: task.prompt,
213221
cwd: repoDir,
214222
timeoutMs: agentTimeoutMs,
215-
env: evalData.env,
223+
env: safeEnv(evalData),
216224
})
217225

218226
const contextFiles = getContextFiles(repoDir, task)
219-
logEntry.costUsd += result.durationMs * 0.001
227+
logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate
220228

221229
// Judge the result — reviewer agents run IN the repo
222230
// so they can build, test, start the app, use browser tools, etc.
@@ -229,7 +237,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
229237
error: result.exitCode !== 0 ? result.stderr : undefined,
230238
criteria,
231239
reviewerAgents,
232-
env: evalData.env,
233240
})
234241

235242
return judging
@@ -250,7 +257,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
250257
const currentDocs = readCurrentDocs(repoPath)
251258

252259
const docSuggestion = await analyzeFailure({
253-
client,
254260
judgeResult: oldJudging,
255261
taskPrompt: task.prompt,
256262
agentDiff: '', // agent diff not preserved after withTestRepo cleanup
@@ -290,11 +296,11 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
290296
prompt: task.prompt,
291297
cwd: freshRepoDir,
292298
timeoutMs: agentTimeoutMs,
293-
env: evalData.env,
299+
env: safeEnv(evalData),
294300
})
295301

296302
const contextFiles = getContextFiles(freshRepoDir, task)
297-
logEntry.costUsd += result.durationMs * 0.001
303+
logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate
298304

299305
console.log(`Re-judging with reviewer agents...`)
300306
return await judgeCommitResult({
@@ -305,7 +311,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise<void> {
305311
error: result.exitCode !== 0 ? result.stderr : undefined,
306312
criteria,
307313
reviewerAgents,
308-
env: evalData.env,
309314
})
310315
},
311316
)

0 commit comments

Comments (0)