Skip to content

Commit 05009f4

Browse files
feat(evals): Add build script and fix related type errors
Adds a "build": "tsc" script to evals/package.json. This change also fixes the type errors that surfaced once the build script was added, primarily by refactoring calls to promptAiSdkStructured to use its updated signature, which takes a single options object.

Generated with Codebuff 🤖
Co-Authored-By: Codebuff <noreply@codebuff.com>
1 parent 8e78927 commit 05009f4

File tree

5 files changed

+40
-46
lines changed

5 files changed

+40
-46
lines changed

evals/git-evals/gen-evals.ts

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -125,17 +125,15 @@ async function selectSubstantialCommits(
125125

126126
const prompt = `${COMMIT_SELECTION_PROMPT}\n\nCommits to evaluate:\n\n${commitsInfo}`
127127

128-
const response = await promptAiSdkStructured(
129-
[{ role: 'user', content: prompt }],
130-
{
131-
schema: CommitSelectionSchema,
132-
model: claudeModels.sonnet,
133-
clientSessionId,
134-
fingerprintId,
135-
userInputId,
136-
userId: undefined,
137-
}
138-
)
128+
const response = await promptAiSdkStructured({
129+
messages: [{ role: 'user', content: prompt }],
130+
schema: CommitSelectionSchema,
131+
model: claudeModels.sonnet,
132+
clientSessionId,
133+
fingerprintId,
134+
userInputId,
135+
userId: undefined,
136+
})
139137

140138
try {
141139
return commits
@@ -237,17 +235,15 @@ Commit Message: ${commit.message}
237235
Changes Made:
238236
${diff}`
239237

240-
const { spec } = await promptAiSdkStructured(
241-
[{ role: 'user', content: prompt }],
242-
{
243-
schema: z.object({ spec: z.string() }),
244-
model: geminiModels.gemini2_5_pro_preview,
245-
clientSessionId,
246-
fingerprintId,
247-
userInputId,
248-
userId: undefined,
249-
}
250-
)
238+
const { spec } = await promptAiSdkStructured({
239+
messages: [{ role: 'user', content: prompt }],
240+
schema: z.object({ spec: z.string() }),
241+
model: geminiModels.gemini2_5_pro_preview,
242+
clientSessionId,
243+
fingerprintId,
244+
userInputId,
245+
userId: undefined,
246+
})
251247
return { spec, fileStates }
252248
}
253249

evals/git-evals/judge-git-eval.ts

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,13 @@ export function judgeEvalRun(evalRun: EvalRunLog) {
181181
console.log(`Using truncated prompt with ${finalTokenCount} tokens (trace truncated, base: ${baseTokens}, max trace: ${maxTraceTokens})`)
182182
}
183183

184-
return promptAiSdkStructured(
185-
[{ role: 'user', content: finalPrompt }],
186-
{
187-
schema: JudgingAnalysisSchema,
188-
model: geminiModels.gemini2_5_pro_preview,
189-
clientSessionId: generateCompactId(),
190-
fingerprintId: generateCompactId(),
191-
userInputId: generateCompactId(),
192-
userId: undefined,
193-
}
194-
)
184+
return promptAiSdkStructured({
185+
messages: [{ role: 'user', content: finalPrompt }],
186+
schema: JudgingAnalysisSchema,
187+
model: geminiModels.gemini2_5_pro_preview,
188+
clientSessionId: generateCompactId(),
189+
fingerprintId: generateCompactId(),
190+
userInputId: generateCompactId(),
191+
userId: undefined,
192+
})
195193
}

evals/git-evals/post-eval-analysis.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ export async function analyzeEvalResults(
180180
const tokenCount = countTokens(finalPrompt)
181181
console.log(`Post-eval analysis prompt: ${tokenCount} tokens`)
182182

183-
return promptAiSdkStructured([{ role: 'user', content: finalPrompt }], {
183+
return promptAiSdkStructured({
184+
messages: [{ role: 'user', content: finalPrompt }],
184185
schema: PostEvalAnalysisSchema,
185186
model: geminiModels.gemini2_5_pro_preview,
186187
clientSessionId: generateCompactId(),

evals/git-evals/run-git-evals.ts

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ async function runSingleEval(
8888
// Get next prompt from Sonnet agent with timeout
8989
let agentResponse: any
9090
try {
91-
agentResponse = await promptAiSdkStructured(
92-
[
91+
agentResponse = await promptAiSdkStructured({
92+
messages: [
9393
{
9494
role: 'user',
9595
content: `You are an expert software engineer tasked with implementing a specification using CodeBuff, an AI coding assistant. Your goal is to prompt CodeBuff to implement the spec correctly. You are in a conversation with this coding agent.
@@ -109,16 +109,14 @@ If deciding to continue, include a clear, focused prompt for Codebuff in next_pr
109109
Explain your reasoning in detail.`,
110110
},
111111
],
112-
{
113-
schema: AgentDecisionSchema,
114-
model: claudeModels.sonnet,
115-
clientSessionId,
116-
fingerprintId,
117-
userInputId: generateCompactId(),
118-
userId: undefined,
119-
timeout: 5 * 60_000, // 5 minute timeout
120-
}
121-
)
112+
schema: AgentDecisionSchema,
113+
model: claudeModels.sonnet,
114+
clientSessionId,
115+
fingerprintId,
116+
userInputId: generateCompactId(),
117+
userId: undefined,
118+
timeout: 5 * 60_000, // 5 minute timeout
119+
})
122120
} catch (agentError) {
123121
throw new Error(
124122
`Agent decision failed: ${agentError instanceof Error ? agentError.message : String(agentError)}`

evals/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"test:swe-bench": "bun test swe-bench.test.ts",
1010
"test:e2e-cat-app": "bun run e2e-cat-app-script.ts",
1111
"typecheck": "tsc --noEmit",
12+
"build": "tsc",
1213
"gen-git-evals": "bun run git-evals/gen-evals.ts",
1314
"run-git-evals": "bun run git-evals/run-git-evals.ts",
1415
"run-eval-set": "bun run run-eval-set.ts",

0 commit comments

Comments
 (0)