Skip to content

Commit b238947

Browse files
jahooma and claude committed
evalbuff: rewrite docs-optimizer to use Claude CLI instead of CodebuffClient
Removes the CodebuffClient/SDK dependency from analyzeFailure. Uses Claude CLI with a temp file for the prompt (avoids CLI arg length limits). Adds JSON extraction with markdown fence stripping and validation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7122c05 commit b238947

File tree

1 file changed

+59
-53
lines changed

1 file changed

+59
-53
lines changed

evals/evalbuff/docs-optimizer.ts

Lines changed: 59 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,17 @@
1+
import { execSync } from 'child_process'
12
import fs from 'fs'
3+
import os from 'os'
24
import path from 'path'
35

4-
import { withTimeout } from '@codebuff/common/util/promise'
5-
66
import type { JudgingResult } from './judge'
7-
import type { AgentDefinition, CodebuffClient } from '@codebuff/sdk'
87

98
export interface DocSuggestion {
109
reasoning: string
1110
suggestedDocPath: string // relative to docs/, e.g. "coding-patterns/error-handling.md"
1211
suggestedContent: string
1312
}
1413

15-
const docWriterAgent: AgentDefinition = {
16-
id: 'doc-writer',
17-
model: 'anthropic/claude-sonnet-4.5',
18-
displayName: 'Doc Writer',
19-
toolNames: ['set_output'],
20-
inputSchema: {
21-
prompt: { type: 'string', description: 'The analysis prompt' },
22-
},
23-
outputMode: 'structured_output',
24-
outputSchema: {
25-
type: 'object',
26-
properties: {
27-
reasoning: {
28-
type: 'string',
29-
description:
30-
'Why this doc would help the agent avoid the identified failure',
31-
},
32-
suggestedDocPath: {
33-
type: 'string',
34-
description:
35-
'File path relative to docs/ directory, e.g. "patterns/error-handling.md"',
36-
},
37-
suggestedContent: {
38-
type: 'string',
39-
description: 'The markdown content to write to the doc file',
40-
},
41-
},
42-
required: ['reasoning', 'suggestedDocPath', 'suggestedContent'],
43-
},
44-
systemPrompt: `You are an expert at writing developer documentation that helps AI coding agents perform better.
14+
const DOC_WRITER_SYSTEM_PROMPT = `You are an expert at writing developer documentation that helps AI coding agents perform better.
4515
4616
Your job: Given a coding agent's failure on a task, write a targeted documentation file that would prevent this class of error in the future.
4717
@@ -53,29 +23,37 @@ Your job: Given a coding agent's failure on a task, write a targeted documentati
5323
4. Write docs that a coding agent will read and immediately know what to do differently.
5424
5. Keep docs concise — under 200 lines. Dense information beats verbose explanations.
5525
6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/").
56-
7. Include examples of correct patterns from the codebase when possible.`,
57-
}
26+
7. Include examples of correct patterns from the codebase when possible.
27+
28+
## Output Format
29+
30+
You MUST respond with ONLY a JSON object (no markdown fences, no explanation). The JSON must have exactly these fields:
31+
{
32+
"reasoning": "Why this doc would help",
33+
"suggestedDocPath": "path/relative/to/docs/dir.md",
34+
"suggestedContent": "The markdown content"
35+
}`
5836

5937
/**
6038
* Analyze a failure and suggest a doc edit to prevent it.
39+
* Uses Claude CLI to generate suggestions.
6140
* Returns null if score is at or above threshold (no improvement needed).
6241
*/
6342
export async function analyzeFailure({
64-
client,
6543
judgeResult,
6644
taskPrompt,
6745
agentDiff,
6846
groundTruthDiff,
6947
currentDocs,
7048
scoreThreshold,
7149
}: {
72-
client: CodebuffClient
7350
judgeResult: JudgingResult
7451
taskPrompt: string
7552
agentDiff: string
7653
groundTruthDiff: string
7754
currentDocs: Record<string, string>
7855
scoreThreshold: number
56+
client?: unknown // kept for backwards compat, ignored
7957
}): Promise<DocSuggestion | null> {
8058
if (judgeResult.overallScore >= scoreThreshold) {
8159
return null
@@ -85,7 +63,9 @@ export async function analyzeFailure({
8563
.map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``)
8664
.join('\n\n')
8765

88-
const prompt = `## Task Prompt
66+
const prompt = `${DOC_WRITER_SYSTEM_PROMPT}
67+
68+
## Task Prompt
8969
${taskPrompt}
9070
9171
## Judge Analysis
@@ -107,26 +87,47 @@ ${agentDiff || '(No changes made)'}
10787
## Current Docs (already available to the agent)
10888
${docsContent || '(No docs yet)'}
10989
110-
Based on the gap between what the agent did and what it should have done, write a doc file that would help the agent get it right next time. Focus on the specific weakness identified by the judge.`
90+
Based on the gap between what the agent did and what it should have done, write a doc file that would help the agent get it right next time. Focus on the specific weakness identified by the judge.
91+
92+
Respond with ONLY the JSON object.`
11193

11294
try {
113-
const result = await withTimeout(
114-
client.run({
115-
agent: docWriterAgent.id,
116-
prompt,
117-
agentDefinitions: [docWriterAgent],
118-
handleEvent: () => {},
119-
}),
120-
10 * 60 * 1000,
121-
'Doc writer agent timed out after 10 minutes',
122-
)
123-
124-
if (result.output.type !== 'structuredOutput') {
125-
console.error('Doc writer did not return structured output')
95+
// Write prompt to temp file to avoid CLI arg length limits
96+
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docwriter-'))
97+
const promptFile = path.join(tmpDir, 'DOC_WRITER_PROMPT.md')
98+
fs.writeFileSync(promptFile, prompt)
99+
100+
let output: string
101+
try {
102+
output = execSync(
103+
`claude --dangerously-skip-permissions -p "Read the file ${promptFile} and follow all instructions in it. Respond with ONLY the JSON object as specified."`,
104+
{
105+
encoding: 'utf-8',
106+
timeout: 5 * 60 * 1000,
107+
stdio: ['ignore', 'pipe', 'pipe'],
108+
maxBuffer: 10 * 1024 * 1024,
109+
},
110+
).trim()
111+
} finally {
112+
fs.rmSync(tmpDir, { recursive: true, force: true })
113+
}
114+
115+
// Try to extract JSON from the output
116+
let jsonStr = output
117+
// Strip markdown code fences if present
118+
const jsonMatch = output.match(/```(?:json)?\s*\n([\s\S]*?)\n\s*```/)
119+
if (jsonMatch) {
120+
jsonStr = jsonMatch[1]
121+
}
122+
// Try to find a JSON object
123+
const objMatch = jsonStr.match(/\{[\s\S]*\}/)
124+
if (!objMatch) {
125+
console.error('Doc writer did not return JSON')
126126
return null
127127
}
128128

129-
const value = result.output.value as DocSuggestion
129+
const value = JSON.parse(objMatch[0]) as DocSuggestion
130+
130131
// Validate the path is under docs/
131132
if (
132133
value.suggestedDocPath.startsWith('/') ||
@@ -138,6 +139,11 @@ Based on the gap between what the agent did and what it should have done, write
138139
return null
139140
}
140141

142+
if (!value.reasoning || !value.suggestedDocPath || !value.suggestedContent) {
143+
console.error('Doc writer returned incomplete suggestion')
144+
return null
145+
}
146+
141147
return value
142148
} catch (error) {
143149
console.error('Doc writer failed:', error)

0 commit comments

Comments
 (0)