Skip to content

Commit 964f441

Browse files
committed
Revert using Codebuff sdk for prompting agent
1 parent 9e86141 commit 964f441

File tree

1 file changed

+35
-31
lines changed

1 file changed

+35
-31
lines changed

evals/git-evals/run-git-evals.ts

Lines changed: 35 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -3,19 +3,19 @@ import fs from 'fs'
33
import path from 'path'
44

55
import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs'
6+
import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk'
7+
import { models } from '@codebuff/common/constants'
68
import { withTimeout } from '@codebuff/common/util/promise'
79
import { generateCompactId } from '@codebuff/common/util/string'
810
import pLimit from 'p-limit'
9-
import { getUserCredentials } from '@codebuff/npm-app/credentials'
1011

1112
import { resetRepoToCommit } from '../scaffolding'
1213
import { createInitialSessionState } from '../test-setup'
13-
import { judgeEvalRun } from './judge-git-eval'
1414
import { ClaudeRunner } from './runners/claude'
1515
import { CodebuffRunner } from './runners/codebuff'
1616
import { extractRepoNameFromUrl, setupTestRepo } from './setup-test-repo'
1717
import { AgentDecisionSchema } from './types'
18-
import { getNextEvalPrompt } from './prompting-agent'
18+
import { judgeEvalRun } from './judge-git-eval'
1919

2020
import type { AgentStep } from '../scaffolding'
2121
import type { Runner } from './runners/runner'
@@ -30,8 +30,6 @@ import type {
3030
} from './types'
3131
import type { z } from 'zod/v4'
3232
import type { ChildProcess } from 'child_process'
33-
import { CodebuffClient } from '../../sdk/src/client'
34-
import { API_KEY_ENV_VAR } from '@codebuff/common/constants'
3533

3634
disableLiveUserInputCheck()
3735

@@ -66,16 +64,6 @@ export async function runSingleEval(
6664
process.on('uncaughtException', uncaughtHandler)
6765
process.on('unhandledRejection', unhandledHandler)
6866

69-
// SDK client for prompting agent
70-
const apiKey = process.env[API_KEY_ENV_VAR] || getLocalAuthToken()
71-
const sdkClient = new CodebuffClient({
72-
apiKey,
73-
cwd: projectPath,
74-
onError: (error) => {
75-
throw new Error(`Prompting agent error: ${error.message}`)
76-
},
77-
})
78-
7967
try {
8068
// Reset to the commit before the target commit
8169
resetRepoToCommit(projectPath, `${evalCommit.sha}^`)
@@ -122,18 +110,40 @@ export async function runSingleEval(
122110
)
123111
.join('\n\n')
124112

125-
// Get next prompt from prompting agent using Codebuff SDK
113+
// Get next prompt from prompting agent with timeout
126114
let agentResponse: z.infer<typeof AgentDecisionSchema>
127115
try {
128-
agentResponse = await withTimeout(
129-
getNextEvalPrompt({
130-
client: sdkClient,
131-
spec: evalCommit.spec,
132-
conversationHistory: renderedTrace,
133-
attemptsRemaining: MAX_ATTEMPTS - attempts,
134-
}),
135-
5 * 60_000, // 5 minute timeout
136-
)
116+
agentResponse = await promptAiSdkStructured({
117+
messages: [
118+
{
119+
role: 'user',
120+
content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
121+
122+
Current spec to implement:
123+
<spec>${evalCommit.spec}</spec>
124+
125+
Your conversation with Codebuff so far:
126+
<conversation>${renderedTrace}</conversation>
127+
128+
Note that files can only be changed with tools. If no tools are called, no files were changed.
129+
130+
You must decide whether to:
131+
1. 'continue' - Generate a follow-up prompt for Codebuff
132+
2. 'complete' - The implementation is done and satisfies the spec
133+
3. 'halt' - The implementation is off track and unlikely to be completed within ${MAX_ATTEMPTS - attempts} more attempts
134+
135+
If deciding to continue, include a clear, focused prompt for Codebuff in next_prompt. Note that Codebuff does not have access to the spec, so you must describe the changes you want Codebuff to make in a way that is clear and concise.
136+
Explain your reasoning in detail.`,
137+
},
138+
],
139+
schema: AgentDecisionSchema,
140+
model: models.gemini2_5_flash,
141+
clientSessionId,
142+
fingerprintId,
143+
userInputId: generateCompactId(),
144+
userId: undefined,
145+
timeout: 5 * 60_000, // 5 minute timeout
146+
})
137147
} catch (agentError) {
138148
throw new Error(
139149
`Agent decision failed: ${agentError instanceof Error ? `${agentError.message}\n${JSON.stringify(agentError)}\n${agentError.stack}` : String(agentError)}`,
@@ -187,8 +197,6 @@ export async function runSingleEval(
187197
process.on('unhandledRejection', handler)
188198
}
189199
})
190-
191-
sdkClient.closeConnection()
192200
}
193201

194202
// If we caught a process-level error, use that
@@ -241,10 +249,6 @@ export async function runSingleEval(
241249
}
242250
}
243251

244-
const getLocalAuthToken = () => {
245-
return getUserCredentials()?.authToken
246-
}
247-
248252
function getCodebuffFileStates(
249253
evalCommitSha: string,
250254
projectPath: string,

0 commit comments

Comments
 (0)