Skip to content

Commit b8d91c5

Browse files
jahooma and claude committed
Fix evalbuff signal quality: commit docs in test repos, isolate Claude calls, filter lockfiles
- Commit pre-copied docs in test repos so they don't appear in the agent's diff — fixes corrupted diff attribution where judges penalized agents for docs they didn't create
- Run prompt generator and doc writer Claude calls with cwd=tmpDir to prevent them from reading the repo's CLAUDE.md/AGENTS.md
- Filter lockfiles (bun.lock, package-lock.json, etc.) from diffs and file lists
- Add 0.3-point minimum threshold for score comparisons to reduce noise
- Cap improvement loop at 5 iterations
- Pass edit history (accepted/rejected docs with scores) to the doc writer so it can avoid repeating rejected approaches and build on what worked

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6d8bf39 commit b8d91c5

File tree

3 files changed

+132
-14
lines changed

3 files changed

+132
-14
lines changed

evalbuff/src/commit-task-generator.ts

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,28 @@ export interface CommitTask {
1414

1515
const MAX_DIFF_CHARS = 200_000
1616

17+
/**
18+
* Files that add noise to diffs without useful signal.
19+
* Lockfiles are huge and auto-generated — agents shouldn't replicate them.
20+
*/
21+
const NOISE_FILE_PATTERNS = [
22+
'bun.lock',
23+
'bun.lockb',
24+
'package-lock.json',
25+
'yarn.lock',
26+
'pnpm-lock.yaml',
27+
'Gemfile.lock',
28+
'Cargo.lock',
29+
'poetry.lock',
30+
'composer.lock',
31+
'go.sum',
32+
]
33+
34+
function isNoiseFile(filePath: string): boolean {
35+
const basename = filePath.split('/').pop() || ''
36+
return NOISE_FILE_PATTERNS.includes(basename)
37+
}
38+
1739
/**
1840
* Get a list of commits from the repo, oldest first.
1941
* Starts from `startAfterSha` (exclusive) or HEAD~commitCount if no state.
@@ -68,19 +90,24 @@ export function getCommitInfo(
6890
encoding: 'utf-8',
6991
}).trim()
7092

71-
// Get diff
72-
const diff = execSync(`git diff ${parentSha} ${sha}`, {
73-
cwd: repoPath,
74-
encoding: 'utf-8',
75-
maxBuffer: 10 * 1024 * 1024,
76-
})
77-
78-
// Get files changed
93+
// Get files changed (filter out noise files like lockfiles)
7994
const filesOutput = execSync(`git diff --name-only ${parentSha} ${sha}`, {
8095
cwd: repoPath,
8196
encoding: 'utf-8',
8297
}).trim()
83-
const filesChanged = filesOutput ? filesOutput.split('\n') : []
98+
const allFiles = filesOutput ? filesOutput.split('\n') : []
99+
const filesChanged = allFiles.filter((f) => !isNoiseFile(f))
100+
101+
// Get diff, excluding noise files (lockfiles etc.)
102+
const excludeArgs = NOISE_FILE_PATTERNS.map((p) => `':!${p}'`).join(' ')
103+
const diff = execSync(
104+
`git diff ${parentSha} ${sha} -- . ${excludeArgs}`,
105+
{
106+
cwd: repoPath,
107+
encoding: 'utf-8',
108+
maxBuffer: 10 * 1024 * 1024,
109+
},
110+
)
84111

85112
return { parentSha, message, diff, filesChanged }
86113
} catch {
@@ -124,6 +151,7 @@ function readFilesAtParent(
124151

125152
for (const filePath of filesChanged) {
126153
if (totalSize >= maxTotalSize) break
154+
if (isNoiseFile(filePath)) continue
127155

128156
const content = readFileAtCommit(repoPath, parentSha, filePath)
129157
if (content != null && content.length > 0) {
@@ -209,9 +237,12 @@ ${diff}
209237
try {
210238
fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`)
211239

240+
// IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md,
241+
// which can confuse prompt generation (e.g., generating prompts about evalbuff itself).
212242
const output = execSync(
213243
`claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. Respond with ONLY the task prompt text."`,
214244
{
245+
cwd: tmpDir,
215246
encoding: 'utf-8',
216247
timeout: 2 * 60 * 1000,
217248
stdio: ['ignore', 'pipe', 'pipe'],
@@ -245,11 +276,17 @@ export async function buildCommitTask(
245276
return null
246277
}
247278

248-
// Skip commits with no meaningful code changes
279+
// Skip commits with no meaningful code changes (after filtering noise files)
249280
if (info.filesChanged.length === 0) {
250281
return null
251282
}
252283

284+
// Skip commits where the diff is empty after filtering noise files
285+
if (info.diff.trim().length === 0) {
286+
console.log(`Skipping ${sha.slice(0, 8)}: only noise files changed (lockfiles, etc.)`)
287+
return null
288+
}
289+
253290
const prompt = await generatePromptFromCommit(
254291
repoPath,
255292
info.parentSha,

evalbuff/src/docs-optimizer.ts

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,53 @@ You MUST respond with ONLY a JSON object (no markdown fences, no explanation). T
6363
Or if too task-specific:
6464
{"skip": true, "reasoning": "explanation"}`
6565

66+
function formatEditHistory(history?: DocEditHistoryEntry[]): string {
67+
if (!history || history.length === 0) return ''
68+
69+
const lines = history.map((entry) => {
70+
const score =
71+
entry.scoreBefore != null && entry.scoreAfter != null
72+
? ` (score: ${entry.scoreBefore.toFixed(1)} → ${entry.scoreAfter.toFixed(1)})`
73+
: ''
74+
return `- **${entry.outcome.toUpperCase()}**: \`${entry.path}\`${score}\n Reasoning: ${entry.reasoning}`
75+
})
76+
77+
return `## Edit History (previous doc edits tried this session)
78+
79+
Use this history to avoid repeating rejected approaches and to build on what worked.
80+
81+
${lines.join('\n')}`
82+
}
83+
6684
/**
6785
* Analyze agent run results and suggest a doc edit to improve future performance.
6886
* Always analyzes — no score threshold check.
6987
* Returns null if the doc writer decides the failure is too task-specific to generalize.
7088
*/
89+
export interface DocEditHistoryEntry {
90+
path: string
91+
reasoning: string
92+
outcome: 'accepted' | 'rejected'
93+
scoreBefore?: number
94+
scoreAfter?: number
95+
}
96+
7197
export async function analyzeFailure({
7298
judgeResult,
7399
taskPrompt,
74100
agentDiff,
75101
agentTrace,
76102
groundTruthDiff,
77103
currentDocs,
104+
editHistory,
78105
}: {
79106
judgeResult: JudgingResult
80107
taskPrompt: string
81108
agentDiff: string
82109
agentTrace?: string // stdout from the agent — reasoning, tool calls, errors
83110
groundTruthDiff?: string // optional — not available in prompt mode
84111
currentDocs: Record<string, string>
112+
editHistory?: DocEditHistoryEntry[]
85113
}): Promise<DocSuggestion | null> {
86114
const docsContent = Object.entries(currentDocs)
87115
.map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``)
@@ -145,6 +173,8 @@ ${traceSection}
145173
## Current Docs (already available to the agent)
146174
${docsContent || '(No docs yet)'}
147175
176+
${formatEditHistory(editHistory)}
177+
148178
Based on the agent's trace (if available), the gap between what the agent did and what it should have done, and the judge's analysis, write a doc file that captures a GENERAL PATTERN that would help the agent across many similar tasks. Focus on what the agent MISUNDERSTOOD (visible in the trace) rather than just what it got wrong (visible in the diff). If this failure doesn't reveal a generalizable pattern, respond with {"skip": true, "reasoning": "..."}.
149179
150180
Respond with ONLY the JSON object.`
@@ -156,9 +186,12 @@ Respond with ONLY the JSON object.`
156186

157187
let output: string
158188
try {
189+
// IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md,
190+
// which can pollute the doc writer's analysis with unrelated project context.
159191
output = execSync(
160192
`claude --dangerously-skip-permissions -p "Read the file ${promptFile} and follow all instructions in it. Respond with ONLY the JSON object as specified."`,
161193
{
194+
cwd: tmpDir,
162195
encoding: 'utf-8',
163196
timeout: 5 * 60 * 1000,
164197
stdio: ['ignore', 'pipe', 'pipe'],
@@ -298,13 +331,18 @@ export function revertDocEdit(
298331

299332
/**
300333
* Compare scores to determine if a doc edit improved things.
334+
* Requires a minimum improvement of 0.3 points to count as "improved"
335+
* to avoid accepting docs based on noise (especially with low parallelism).
301336
*/
337+
const MIN_IMPROVEMENT_THRESHOLD = 0.3
338+
302339
export function compareScores(
303340
oldScore: number,
304341
newScore: number,
305342
): 'improved' | 'same' | 'worse' {
306-
if (newScore > oldScore) return 'improved'
307-
if (newScore < oldScore) return 'worse'
343+
const delta = newScore - oldScore
344+
if (delta >= MIN_IMPROVEMENT_THRESHOLD) return 'improved'
345+
if (delta <= -MIN_IMPROVEMENT_THRESHOLD) return 'worse'
308346
return 'same'
309347
}
310348

evalbuff/src/run-evalbuff.ts

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,13 @@ async function runAgentsInParallel(opts: {
173173
}
174174
}
175175

176+
/**
177+
* Copy docs into a test repo and commit them so they don't appear in the agent's diff.
178+
*
179+
* Without this commit, `git diff HEAD` after the agent runs would include
180+
* the pre-copied docs as "new files", corrupting the diff attribution —
181+
* the judge would penalize or credit the agent for docs it didn't create.
182+
*/
176183
function copyDocsIntoRepo(
177184
sourceRepoPath: string,
178185
targetRepoPath: string,
@@ -182,11 +189,31 @@ function copyDocsIntoRepo(
182189
const targetDocsDir = path.join(targetRepoPath, 'docs')
183190
const targetAgentsMd = path.join(targetRepoPath, 'AGENTS.md')
184191

192+
let copied = false
185193
if (fs.existsSync(sourceDocsDir)) {
186194
fs.cpSync(sourceDocsDir, targetDocsDir, { recursive: true })
195+
copied = true
187196
}
188197
if (fs.existsSync(sourceAgentsMd)) {
189198
fs.cpSync(sourceAgentsMd, targetAgentsMd)
199+
copied = true
200+
}
201+
202+
// Commit the docs so they become part of HEAD — otherwise git diff HEAD
203+
// after the agent runs will include these docs as agent-created changes.
204+
if (copied) {
205+
try {
206+
execSync('git add docs/ AGENTS.md 2>/dev/null; git add -u docs/ AGENTS.md 2>/dev/null', {
207+
cwd: targetRepoPath,
208+
stdio: 'ignore',
209+
})
210+
execSync('git commit -m "evalbuff: pre-load docs" --allow-empty', {
211+
cwd: targetRepoPath,
212+
stdio: 'ignore',
213+
})
214+
} catch {
215+
// If nothing to commit, that's fine
216+
}
190217
}
191218
}
192219

@@ -213,8 +240,8 @@ async function improveDocs(opts: {
213240
}): Promise<{
214241
finalScore: number
215242
baselineScore: number
216-
docsKept: Array<{ path: string; reasoning: string }>
217-
docsRejected: Array<{ path: string; reasoning: string }>
243+
docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
244+
docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
218245
totalCost: number
219246
}> {
220247
const {
@@ -259,7 +286,14 @@ async function improveDocs(opts: {
259286

260287
// Step 2: Iterative doc improvement
261288
let improving = true
289+
const MAX_IMPROVEMENT_ITERATIONS = 5
290+
let iterationCount = 0
262291
while (improving) {
292+
iterationCount++
293+
if (iterationCount > MAX_IMPROVEMENT_ITERATIONS) {
294+
console.log(` Hit max improvement iterations (${MAX_IMPROVEMENT_ITERATIONS}), stopping.`)
295+
break
296+
}
263297
// Pick the worst-scoring judging for analysis
264298
const worstIdx = baseline.judgings.reduce(
265299
(minIdx, j, idx, arr) =>
@@ -273,13 +307,18 @@ async function improveDocs(opts: {
273307
const currentDocs = readCurrentDocs(repoPath)
274308

275309
console.log(` Analyzing for doc improvements...`)
310+
const editHistory = [
311+
...docsKept.map((d) => ({ ...d, outcome: 'accepted' as const })),
312+
...docsRejected.map((d) => ({ ...d, outcome: 'rejected' as const })),
313+
]
276314
const docSuggestion = await analyzeFailure({
277315
judgeResult: worstJudging,
278316
taskPrompt: prompt,
279317
agentDiff: worstDiff,
280318
agentTrace: worstTrace,
281319
groundTruthDiff,
282320
currentDocs,
321+
editHistory,
283322
})
284323

285324
if (!docSuggestion) {
@@ -325,6 +364,8 @@ async function improveDocs(opts: {
325364
docsKept.push({
326365
path: docSuggestion.suggestedDocPath,
327366
reasoning: docSuggestion.reasoning,
367+
scoreBefore: currentScore,
368+
scoreAfter: rerun.avgScore,
328369
})
329370

330371
// Commit the doc change
@@ -351,6 +392,8 @@ async function improveDocs(opts: {
351392
docsRejected.push({
352393
path: docSuggestion.suggestedDocPath,
353394
reasoning: docSuggestion.reasoning,
395+
scoreBefore: currentScore,
396+
scoreAfter: rerun.avgScore,
354397
})
355398

356399
// Revert the doc edit — restore previous content if it existed

0 commit comments

Comments
 (0)