Improve evalbuff prompt generation: full file context, buffbench-style prompts

jahooma · claude · jahooma · commit d5b2a92b2b3a · 2026-03-26T23:48:50.000-07:00
- Read full file contents at the parent commit (up to ~500K chars in total
  across all changed files) to give the prompt generator rich context about
  the codebase, matching buffbench's approach
- Include the complete diff (up to 200K chars) instead of truncating it to the first 3000 chars
- Rewrite system prompt to produce human-like prompts: high-level functional
  requirements, natural language, no file paths unless a human would mention them
- Skip commits with diffs >200K instead of >50K

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/evalbuff/src/commit-task-generator.ts b/evalbuff/src/commit-task-generator.ts
@@ -12,6 +12,8 @@ export interface CommitTask {
   filesChanged: string[]
 }
 
+const MAX_DIFF_CHARS = 200_000
+
 /**
  * Get a list of commits from the repo, oldest first.
  * Starts from `startAfterSha` (exclusive) or HEAD~commitCount if no state.
@@ -87,41 +89,125 @@ export function getCommitInfo(
 }
 
 /**
- * Generate a human-like task prompt from a commit's message and diff.
- * Uses Claude CLI to rephrase the commit into a natural coding task.
+ * Read a file's content at a specific commit SHA.
+ * Returns null if the file doesn't exist at that commit.
  */
-export async function generatePromptFromCommit(
-  message: string,
-  diff: string,
+function readFileAtCommit(
+  repoPath: string,
+  sha: string,
+  filePath: string,
+): string | null {
+  try {
+    return execSync(`git show ${sha}:${JSON.stringify(filePath)}`, {
+      cwd: repoPath,
+      encoding: 'utf-8',
+      maxBuffer: 10 * 1024 * 1024,
+    })
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Read the full contents of all files being modified at the parent commit.
+ * This gives the prompt generator context about what the code looks like
+ * before the change, so it can write a realistic human prompt.
+ */
+function readFilesAtParent(
+  repoPath: string,
+  parentSha: string,
   filesChanged: string[],
-): Promise<string> {
-  const systemPrompt = `You are generating a task prompt that a developer might write to ask a coding agent to make changes to a codebase. You'll be given a git commit message and diff. Your job is to write a natural, human-sounding prompt that would lead an agent to make similar changes.
+): Record<string, string> {
+  const files: Record<string, string> = {}
+  let totalSize = 0
+  const maxTotalSize = 500_000 // 500K total for all files
+
+  for (const filePath of filesChanged) {
+    if (totalSize >= maxTotalSize) break
+
+    const content = readFileAtCommit(repoPath, parentSha, filePath)
+    if (content != null && content.length > 0) {
+      files[filePath] = content
+      totalSize += content.length
+    }
+  }
+
+  return files
+}
+
+const PROMPT_GEN_SYSTEM = `You are generating a task prompt that a human developer would realistically write to ask an AI coding agent to make changes to their codebase.
+
+You will receive:
+- A git diff showing exactly what was changed
+- The full contents of all files being modified (as they looked BEFORE the change)
+- The commit message (as a hint, but don't just copy it)
+
+Your job is to write a natural, human-sounding prompt — the kind of thing a developer would type into a chat with an AI assistant.
+
+## Key Principles
+
+1. Focus on high-level functional requirements, not implementation details
+   - GOOD: "add user authentication to the API"
+   - BAD: "implement an authenticateUser function in src/auth/middleware.ts"
+
+2. Use natural language — like a Slack message or ticket description
+   - GOOD: "the nightly CI is pointing at the wrong directory, it should be agents not .agents"
+   - BAD: "Update the directory reference in .github/workflows/nightly-e2e.yml from .agents to agents"
+
+3. Describe what you WANT or what's WRONG, not how to fix it
+   - GOOD: "the hover state on buttons looks broken"
+   - BAD: "change the CSS hover opacity from 0.5 to 0.8 in Button.tsx"
+
+4. Don't reference specific file paths unless a human naturally would. Humans describe the feature area, not the file tree.
+   - GOOD: "our login page needs to redirect to freebuff.com instead of codebuff.com"
+   - BAD: "update src/auth/login.ts, src/config/urls.ts, and tests/auth.test.ts to change codebuff.com to freebuff.com"
 
-## Rules
+5. Don't over-specify. Leave room for the agent to figure out the implementation.
 
-1. Write as if you're a developer describing what you want done — NOT as if you've seen the solution
-2. Be vague enough that the agent has to figure out the implementation details, but specific enough about the desired outcome
-3. Do NOT mention specific line numbers, exact variable names from the diff, or implementation details
-4. DO mention the general area of the codebase, the feature/bug, and the desired behavior
-5. Keep it to 1-4 sentences
-6. Sound natural — like a Slack message or a ticket description, not a formal spec
+6. Keep it to 1-4 sentences.
+
+7. Read the FULL file contents to understand context. The diff alone can be misleading — understanding the surrounding code helps you write a prompt that makes sense for this codebase.
 
 ## Output
 
-Respond with ONLY the prompt text, nothing else.`
+Respond with ONLY the prompt text. No quotes, no preamble, no explanation.`
 
-  const userPrompt = `Commit message: ${message}
+/**
+ * Generate a human-like task prompt from a commit.
+ * Reads the full files at the parent commit for context, similar to how
+ * buffbench uses file-explorer agents to understand the codebase.
+ */
+export async function generatePromptFromCommit(
+  repoPath: string,
+  parentSha: string,
+  message: string,
+  diff: string,
+  filesChanged: string[],
+): Promise<string> {
+  // Read full file contents at the parent commit for context
+  const fileContents = readFilesAtParent(repoPath, parentSha, filesChanged)
+
+  let filesSection = ''
+  if (Object.keys(fileContents).length > 0) {
+    filesSection = `## File Contents (before the change)\n\n`
+    for (const [filePath, content] of Object.entries(fileContents)) {
+      filesSection += `### ${filePath}\n\`\`\`\n${content}\n\`\`\`\n\n`
+    }
+  }
 
-Files changed: ${filesChanged.join(', ')}
+  const userPrompt = `## Commit Message
+${message}
 
-Diff (first 3000 chars):
-${diff.slice(0, 3000)}`
+${filesSection}## Diff
+\`\`\`diff
+${diff}
+\`\`\``
 
   const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-promptgen-'))
   const promptFile = path.join(tmpDir, 'PROMPT_GEN.md')
 
   try {
-    fs.writeFileSync(promptFile, `${systemPrompt}\n\n---\n\n${userPrompt}`)
+    fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`)
 
     const output = execSync(
       `claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. Respond with ONLY the task prompt text."`,
@@ -133,7 +219,7 @@ ${diff.slice(0, 3000)}`
       },
     ).trim()
 
-    return output || `${message}`
+    return output || message
   } catch {
     // Fallback to the commit message itself
     return message
@@ -144,7 +230,7 @@ ${diff.slice(0, 3000)}`
 
 /**
  * Build a full CommitTask from a SHA.
- * Returns null if the commit can't be used (merge, initial, etc).
+ * Returns null if the commit can't be used (merge, initial, too large diff, etc).
  */
 export async function buildCommitTask(
   repoPath: string,
@@ -153,8 +239,8 @@ export async function buildCommitTask(
   const info = getCommitInfo(repoPath, sha)
   if (!info) return null
 
-  // Skip commits with very large diffs (likely auto-generated)
-  if (info.diff.length > 50_000) {
+  // Skip commits with diffs that exceed our limit
+  if (info.diff.length > MAX_DIFF_CHARS) {
     console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`)
     return null
   }
@@ -165,6 +251,8 @@ export async function buildCommitTask(
   }
 
   const prompt = await generatePromptFromCommit(
+    repoPath,
+    info.parentSha,
     info.message,
     info.diff,
     info.filesChanged,