Skip to content

Commit 4543206

Browse files
committed
Update evals to use our sdk for prompting
1 parent adc5b80 commit 4543206

File tree

4 files changed

+150
-46
lines changed

4 files changed

+150
-46
lines changed

evals/git-evals/prompting-agent.ts

Lines changed: 109 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,109 @@
1+
import type { AgentDecisionSchema } from './types'
2+
import type { AgentDefinition } from '@codebuff/common/templates/initial-agents-dir/types/agent-definition'
3+
import type { z } from 'zod/v4'
4+
import { CodebuffClient } from '../../sdk/src'
5+
6+
/**
 * Agent definition for the evaluation "prompting agent".
 *
 * The agent is given the spec and the conversation so far and replies with a
 * structured decision (`continue` / `complete` / `halt`), its reasoning, and,
 * when continuing, the next prompt to send to Codebuff.
 *
 * The decision is reported through the `set_output` tool, so the system prompt
 * must not claim the agent has no tools at all: it only lacks tools that touch
 * files.
 */
const promptingAgentDefinition: AgentDefinition = {
  id: 'eval-prompting-agent',
  displayName: 'Evaluation Prompting Agent',
  model: 'openai/gpt-5-chat',
  toolNames: ['set_output', 'end_turn'],
  inputSchema: {
    prompt: {
      type: 'string',
      description: 'The evaluation context and conversation history',
    },
  },
  outputMode: 'structured_output',
  outputSchema: {
    type: 'object',
    properties: {
      reasoning: {
        type: 'string',
        description: 'Detailed reasoning for the decision',
      },
      decision: {
        type: 'string',
        enum: ['continue', 'complete', 'halt'],
        description:
          'Whether to continue, complete, or halt the implementation',
      },
      next_prompt: {
        type: 'string',
        description:
          'The next prompt to send to Codebuff (required if decision is continue)',
      },
    },
    required: ['decision', 'reasoning'],
  },
  systemPrompt: `You are an expert software engineer tasked with implementing a specification using Codebuff, an AI coding assistant. Your goal is to prompt Codebuff to implement the spec correctly.

You cannot and should not make changes yourself. You have no file-editing tools: the only tool you should call is set_output, to report your decision. You are merely prompting a coding assistant to make changes on your behalf in order to implement the spec.`,
  instructionsPrompt: `Analyze the conversation history and determine whether to:
1. 'continue' - Generate a follow-up prompt for Codebuff the coding agent
2. 'complete' - The implementation is done and satisfies the spec
3. 'halt' - The implementation is off track and unlikely to be completed

If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt. Don't ask to see files or specific code, instead you should mostly describe the changes you want to make (based on the spec). It's fine to ask questions if you need to in order best implement the spec. But keep in mind you only have a few turns to implement the spec.

Explain your reasoning in detail.

You must use the set_output tool to output your reasoning, decision and next_prompt.`,
}
54+
55+
/** Decisions the prompting agent may return. */
const VALID_DECISIONS: readonly string[] = ['continue', 'complete', 'halt']

/**
 * Runtime check that an agent output carries the two required fields with
 * usable values: a known `decision` and a string `reasoning`.
 *
 * The output is produced by an LLM, so its shape is not guaranteed by the
 * type system and has to be verified before it is handed to callers.
 */
function hasRequiredDecisionFields(output: unknown): boolean {
  if (typeof output !== 'object' || output === null) {
    return false
  }
  const candidate = output as Record<string, unknown>
  return (
    typeof candidate.decision === 'string' &&
    VALID_DECISIONS.includes(candidate.decision) &&
    typeof candidate.reasoning === 'string'
  )
}

/**
 * Get the next evaluation prompt using the Codebuff SDK.
 *
 * Runs the `eval-prompting-agent` with the spec and the conversation so far
 * and returns its structured decision.
 *
 * @param client SDK client used to run the prompting agent.
 * @param spec The specification that Codebuff is being asked to implement.
 * @param conversationHistory Rendered trace of the conversation so far.
 * @param attemptsRemaining Number of turns left, quoted to the agent.
 * @returns The agent's decision. If the agent produced no output, or output
 *   without a valid `decision`/`reasoning`, a `halt` decision is returned.
 */
export async function getNextEvalPrompt({
  client,
  spec,
  conversationHistory,
  attemptsRemaining,
}: {
  client: CodebuffClient
  spec: string
  conversationHistory: string
  attemptsRemaining: number
}): Promise<z.infer<typeof AgentDecisionSchema>> {
  const prompt = `You are in a conversation with Codebuff, an AI coding assistant.
Your conversation with Codebuff so far:
<conversation>${conversationHistory}</conversation>

You need to implement the spec via prompting Codebuff the coding agent. You only have ${attemptsRemaining} turns left to implement the spec.

Current spec to implement:
<spec>${spec}</spec>

Note that files can only be changed with tools. If no tools are called, no files were changed.

Analyze the conversation and decide your next action.`

  const result = await client.run({
    agent: 'eval-prompting-agent',
    prompt,
    agentDefinitions: [promptingAgentDefinition],
    maxAgentSteps: 5,
    handleEvent: (event) => {
      console.log('event:', event)
    },
  })

  const output: unknown = result.sessionState.mainAgentState.output
  if (hasRequiredDecisionFields(output)) {
    // Safe to narrow: the required fields were verified at runtime just above.
    return output as z.infer<typeof AgentDecisionSchema>
  }

  if (output) {
    // The agent answered, but not in the expected shape; show what it sent.
    console.error('Error: malformed output from prompting agent:', output)
  }

  // Print this so we can debug when it doesn't output anything usable.
  console.error(
    'Error: no output from prompting agent. Message history:',
    result.sessionState.mainAgentState.messageHistory,
  )

  // Fallback response
  return {
    decision: 'halt' as const,
    reasoning: `No valid response from prompting agent.`,
    next_prompt: undefined,
  }
}

evals/git-evals/run-git-evals.ts

Lines changed: 31 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,10 @@ import fs from 'fs'
33
import path from 'path'
44

55
import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs'
6-
import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk'
7-
import { models } from '@codebuff/common/constants'
86
import { withTimeout } from '@codebuff/common/util/promise'
97
import { generateCompactId } from '@codebuff/common/util/string'
108
import pLimit from 'p-limit'
9+
import { getUserCredentials } from '@codebuff/npm-app/credentials'
1110

1211
import { resetRepoToCommit } from '../scaffolding'
1312
import { createInitialSessionState } from '../test-setup'
@@ -16,6 +15,7 @@ import { ClaudeRunner } from './runners/claude'
1615
import { CodebuffRunner } from './runners/codebuff'
1716
import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
1817
import { AgentDecisionSchema } from './types'
18+
import { getNextEvalPrompt } from './prompting-agent'
1919

2020
import type { AgentStep } from '../scaffolding'
2121
import type { Runner } from './runners/runner'
@@ -30,6 +30,8 @@ import type {
3030
} from './types'
3131
import type { z } from 'zod/v4'
3232
import type { ChildProcess } from 'child_process'
33+
import { CodebuffClient } from '../../sdk/src/client'
34+
import { API_KEY_ENV_VAR } from '@codebuff/common/constants'
3335

3436
disableLiveUserInputCheck()
3537

@@ -64,6 +66,16 @@ export async function runSingleEval(
6466
process.on('uncaughtException', uncaughtHandler)
6567
process.on('unhandledRejection', unhandledHandler)
6668

69+
// SDK client for prompting agent
70+
const apiKey = process.env[API_KEY_ENV_VAR] || getLocalAuthToken()
71+
const sdkClient = new CodebuffClient({
72+
apiKey,
73+
cwd: projectPath,
74+
onError: (error) => {
75+
throw new Error(`Prompting agent error: ${error.message}`)
76+
},
77+
})
78+
6779
try {
6880
// Reset to the commit before the target commit
6981
resetRepoToCommit(projectPath, `${evalCommit.sha}^`)
@@ -110,43 +122,21 @@ export async function runSingleEval(
110122
)
111123
.join('\n\n')
112124

113-
// Get next prompt from Sonnet agent with timeout
125+
// Get next prompt from prompting agent using Codebuff SDK
114126
let agentResponse: z.infer<typeof AgentDecisionSchema>
115127
try {
116-
agentResponse = await promptAiSdkStructured({
117-
messages: [
118-
{
119-
role: 'user',
120-
content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
121-
122-
Current spec to implement:
123-
<spec>${evalCommit.spec}</spec>
124-
125-
Your conversation with Codebuff so far:
126-
<conversation>${renderedTrace}</conversation>
127-
128-
Note that files can only be changed with tools. If no tools are called, no files were changed.
129-
130-
You must decide whether to:
131-
1. 'continue' - Generate a follow-up prompt for Codebuff
132-
2. 'complete' - The implementation is done and satisfies the spec
133-
3. 'halt' - The implementation is off track and unlikely to be completed within ${MAX_ATTEMPTS - attempts} more attempts
134-
135-
If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt.
136-
Explain your reasoning in detail.`,
137-
},
138-
],
139-
schema: AgentDecisionSchema,
140-
model: models.gemini2_5_flash,
141-
clientSessionId,
142-
fingerprintId,
143-
userInputId: generateCompactId(),
144-
userId: undefined,
145-
timeout: 5 * 60_000, // 5 minute timeout
146-
})
128+
agentResponse = await withTimeout(
129+
getNextEvalPrompt({
130+
client: sdkClient,
131+
spec: evalCommit.spec,
132+
conversationHistory: renderedTrace,
133+
attemptsRemaining: MAX_ATTEMPTS - attempts,
134+
}),
135+
5 * 60_000, // 5 minute timeout
136+
)
147137
} catch (agentError) {
148138
throw new Error(
149-
`Agent decision failed: ${agentError instanceof Error ? agentError.message : String(agentError)}`,
139+
`Agent decision failed: ${agentError instanceof Error ? `${agentError.message}\n${JSON.stringify(agentError)}\n${agentError.stack}` : String(agentError)}`,
150140
)
151141
}
152142

@@ -197,6 +187,8 @@ Explain your reasoning in detail.`,
197187
process.on('unhandledRejection', handler)
198188
}
199189
})
190+
191+
sdkClient.closeConnection()
200192
}
201193

202194
// If we caught a process-level error, use that
@@ -249,6 +241,10 @@ Explain your reasoning in detail.`,
249241
}
250242
}
251243

244+
const getLocalAuthToken = () => {
245+
return getUserCredentials()?.authToken
246+
}
247+
252248
function getCodebuffFileStates(
253249
evalCommitSha: string,
254250
projectPath: string,

sdk/src/client.ts

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
import { buildArray } from '@codebuff/common/util/array'
2-
31
import {
42
initialSessionState,
53
applyOverridesToSessionState,
@@ -54,7 +52,7 @@ export class CodebuffClient {
5452
>
5553
private readonly fingerprintId = `codebuff-sdk-${Math.random().toString(36).substring(2, 15)}`
5654

57-
private readonly promptIdValues: Record<
55+
private readonly promptIdToHandlers: Record<
5856
string,
5957
{
6058
handleEvent?: (event: PrintModeEvent) => void
@@ -95,12 +93,12 @@ export class CodebuffClient {
9593
const { userInputId, chunk } = action
9694
if (typeof chunk === 'string') {
9795
const handleStreamChunk =
98-
this.promptIdValues[userInputId]?.handleStreamChunk
96+
this.promptIdToHandlers[userInputId]?.handleStreamChunk
9997
if (handleStreamChunk) {
10098
handleStreamChunk(chunk)
10199
}
102100
} else {
103-
const handleEvent = this.promptIdValues[userInputId]?.handleEvent
101+
const handleEvent = this.promptIdToHandlers[userInputId]?.handleEvent
104102
if (handleEvent) {
105103
handleEvent(chunk)
106104
}
@@ -187,12 +185,12 @@ export class CodebuffClient {
187185
})
188186
}
189187
const toolResults = previousRun?.toolResults ?? []
190-
this.promptIdValues[promptId] = {
188+
this.promptIdToHandlers[promptId] = {
191189
handleEvent,
192190
handleStreamChunk,
193191
}
194192
if (customToolDefinitions) {
195-
this.promptIdValues[promptId].customToolHandler = async ({
193+
this.promptIdToHandlers[promptId].customToolHandler = async ({
196194
toolName,
197195
input,
198196
}) => {
@@ -246,7 +244,7 @@ export class CodebuffClient {
246244
})
247245

248246
return new Promise<RunState>((resolve, reject) => {
249-
this.promptIdValues[promptId].resolveResponse = { resolve, reject }
247+
this.promptIdToHandlers[promptId].resolveResponse = { resolve, reject }
250248
})
251249
}
252250

@@ -255,12 +253,12 @@ export class CodebuffClient {
255253
) {
256254
const promptId =
257255
action.type === 'prompt-response' ? action.promptId : action.userInputId
258-
const promiseActions = this.promptIdValues[promptId]?.resolveResponse
256+
const promiseActions = this.promptIdToHandlers[promptId]?.resolveResponse
259257
if (!promiseActions) {
260258
return
261259
}
262260

263-
delete this.promptIdValues[promptId]
261+
delete this.promptIdToHandlers[promptId]
264262

265263
if (action.type === 'prompt-error') {
266264
promiseActions.reject(new Error(action.message))
@@ -302,7 +300,7 @@ export class CodebuffClient {
302300
let result: string
303301
if (!toolNames.includes(toolName as ToolName)) {
304302
const customToolHandler =
305-
this.promptIdValues[action.userInputId].customToolHandler
303+
this.promptIdToHandlers[action.userInputId].customToolHandler
306304
if (!customToolHandler) {
307305
throw new Error(
308306
`Custom tool handler not found for user input ID ${action.userInputId}`,

sdk/src/websocket-client.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,7 @@ export class WebSocketHandler {
9999

100100
public close() {
101101
this.cbWebSocket.close()
102+
this.isConnected = false
102103
}
103104

104105
private setupSubscriptions() {

0 commit comments

Comments
 (0)