Skip to content

Commit 14d370e

Browse files
committed
Eval tweaks
1 parent d942c68 commit 14d370e

File tree

3 files changed

+104
-87
lines changed

3 files changed

+104
-87
lines changed

evals/git-evals/judge-git-eval.ts

Lines changed: 59 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -21,15 +21,15 @@ function buildAnalysisPrompt(
2121
.map((state) => {
2222
const diff = createPatch(state.path, state.preContent, state.postContent)
2323
let content = `File: ${state.path}\n\nUnified Diff (Ground Truth):\n${diff}`
24-
24+
2525
if (includeBeforeContent) {
2626
content += `\n\nPre-commit content:\n${state.preContent}`
2727
}
28-
28+
2929
if (includeAfterContent) {
3030
content += `\n\nPost-commit content (Ground Truth):\n${state.postContent}`
3131
}
32-
32+
3333
return content
3434
})
3535
.join('\n\n---\n\n')
@@ -39,25 +39,27 @@ function buildAnalysisPrompt(
3939
.map((state) => {
4040
const diff = createPatch(state.path, state.preContent, state.postContent)
4141
let content = `File: ${state.path}\n\nUnified Diff (Codebuff's Changes):\n${diff}`
42-
42+
4343
if (includeBeforeContent) {
4444
content += `\n\nPre-commit content:\n${state.preContent}`
4545
}
46-
46+
4747
if (includeAfterContent) {
4848
content += `\n\nPost-commit content (Codebuff's Attempt):\n${state.postContent}`
4949
}
50-
50+
5151
return content
5252
})
5353
.join('\n\n---\n\n')
5454

5555
// Build trace section
56-
const traceContent = truncatedTrace || evalRun.trace
57-
.map(({ prompt, steps }) =>
58-
`Prompt: ${prompt}\n\nCodebuff Steps: ${JSON.stringify(steps)}`.trim()
59-
)
60-
.join('\n\n')
56+
const traceContent =
57+
truncatedTrace ||
58+
evalRun.trace
59+
.map(({ prompt, steps }) =>
60+
`Prompt: ${prompt}\n\nCodebuff Steps: ${JSON.stringify(steps)}`.trim()
61+
)
62+
.join('\n\n')
6163

6264
return `You are an expert software engineer tasked with analyzing and scoring the code quality of changes made by an AI coding assistant (Codebuff). Please analyze the following interaction trace and compare both the attempted changes and the ground truth changes.
6365
@@ -107,25 +109,26 @@ Provide your response in a structured format with analysis, lists of strengths a
107109
function truncateTraceFromEnd(trace: any[], maxTokens: number): string {
108110
// Start with full trace and progressively remove from the end
109111
let currentTrace = [...trace]
110-
112+
111113
while (currentTrace.length > 0) {
112114
const traceContent = currentTrace
113115
.map(({ prompt, steps }) =>
114116
`Prompt: ${prompt}\n\nCodebuff Steps: ${JSON.stringify(steps)}`.trim()
115117
)
116118
.join('\n\n')
117-
119+
118120
if (countTokens(traceContent) <= maxTokens) {
119-
const truncationNotice = currentTrace.length < trace.length
120-
? `\n\n[TRACE TRUNCATED: Showing ${currentTrace.length} of ${trace.length} trace entries to fit within token limit]`
121-
: ''
121+
const truncationNotice =
122+
currentTrace.length < trace.length
123+
? `\n\n[TRACE TRUNCATED: Showing ${currentTrace.length} of ${trace.length} trace entries to fit within token limit]`
124+
: ''
122125
return traceContent + truncationNotice
123126
}
124-
127+
125128
// Remove the last entry and try again
126129
currentTrace.pop()
127130
}
128-
131+
129132
return '[TRACE TRUNCATED: All trace entries removed to fit within token limit]'
130133
}
131134

@@ -134,9 +137,21 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
134137

135138
// Try different levels of content inclusion until we fit within token limit
136139
const attempts = [
137-
{ includeBeforeContent: true, includeAfterContent: true, truncatedTrace: undefined },
138-
{ includeBeforeContent: false, includeAfterContent: true, truncatedTrace: undefined },
139-
{ includeBeforeContent: false, includeAfterContent: false, truncatedTrace: undefined },
140+
{
141+
includeBeforeContent: true,
142+
includeAfterContent: true,
143+
truncatedTrace: undefined,
144+
},
145+
{
146+
includeBeforeContent: false,
147+
includeAfterContent: true,
148+
truncatedTrace: undefined,
149+
},
150+
{
151+
includeBeforeContent: false,
152+
includeAfterContent: false,
153+
truncatedTrace: undefined,
154+
},
140155
]
141156

142157
for (const attempt of attempts) {
@@ -146,11 +161,13 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
146161
attempt.includeAfterContent,
147162
attempt.truncatedTrace
148163
)
149-
164+
150165
const tokenCount = countTokens(prompt)
151-
166+
152167
if (tokenCount <= MAX_TOKENS) {
153-
console.log(`Using prompt with ${tokenCount} tokens (before: ${attempt.includeBeforeContent}, after: ${attempt.includeAfterContent})`)
168+
console.log(
169+
`Using prompt with ${tokenCount} tokens (before: ${attempt.includeBeforeContent}, after: ${attempt.includeAfterContent})`
170+
)
154171
finalPrompt = prompt
155172
break
156173
}
@@ -163,28 +180,30 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
163180
{ ...evalRun, trace: [] }, // Empty trace
164181
false, // includeBeforeContent
165182
false, // includeAfterContent
166-
'' // empty trace content
183+
'' // empty trace content
167184
)
168185
const baseTokens = countTokens(basePrompt)
169186
const maxTraceTokens = MAX_TOKENS - baseTokens - 100 // Reserve 100 tokens for truncation notice
170-
187+
171188
const truncatedTrace = truncateTraceFromEnd(evalRun.trace, maxTraceTokens)
172-
189+
173190
finalPrompt = buildAnalysisPrompt(
174191
evalRun,
175192
false, // includeBeforeContent
176193
false, // includeAfterContent
177194
truncatedTrace
178195
)
179-
196+
180197
const finalTokenCount = countTokens(finalPrompt)
181-
console.log(`Using truncated prompt with ${finalTokenCount} tokens (trace truncated, base: ${baseTokens}, max trace: ${maxTraceTokens})`)
198+
console.log(
199+
`Using truncated prompt with ${finalTokenCount} tokens (trace truncated, base: ${baseTokens}, max trace: ${maxTraceTokens})`
200+
)
182201
}
183202

184203
// Run 3 judges in parallel
185204
console.log('Running 3 judges in parallel for more robust scoring...')
186-
187-
const judgePromises = Array.from({ length: 3 }, (_, index) =>
205+
206+
const judgePromises = Array.from({ length: 3 }, (_, index) =>
188207
promptAiSdkStructured({
189208
messages: [{ role: 'user', content: finalPrompt }],
190209
schema: JudgingAnalysisSchema,
@@ -193,14 +212,15 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
193212
fingerprintId: generateCompactId(),
194213
userInputId: generateCompactId(),
195214
userId: undefined,
196-
}).catch(error => {
215+
timeout: 10 * 60 * 1000, // 10 minute timeout
216+
}).catch((error) => {
197217
console.warn(`Judge ${index + 1} failed:`, error)
198218
return null
199219
})
200220
)
201221

202222
const judgeResults = await Promise.all(judgePromises)
203-
const validResults = judgeResults.filter(result => result !== null)
223+
const validResults = judgeResults.filter((result) => result !== null)
204224

205225
if (validResults.length === 0) {
206226
throw new Error('All judges failed to provide results')
@@ -209,11 +229,15 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
209229
console.log(`Successfully got results from ${validResults.length}/3 judges`)
210230

211231
// Sort judges by overall score and select the median
212-
const sortedResults = validResults.sort((a, b) => a.metrics.overallScore - b.metrics.overallScore)
232+
const sortedResults = validResults.sort(
233+
(a, b) => a.metrics.overallScore - b.metrics.overallScore
234+
)
213235
const medianIndex = Math.floor(sortedResults.length / 2)
214236
const medianResult = sortedResults[medianIndex]
215237

216-
console.log(`Using median judge (${medianIndex + 1} of ${sortedResults.length}) with overall score: ${medianResult.metrics.overallScore}`)
238+
console.log(
239+
`Using median judge (${medianIndex + 1} of ${sortedResults.length}) with overall score: ${medianResult.metrics.overallScore}`
240+
)
217241

218242
return medianResult
219243
}

evals/git-evals/post-eval-analysis.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -188,5 +188,6 @@ export async function analyzeEvalResults(
188188
fingerprintId: generateCompactId(),
189189
userInputId: generateCompactId(),
190190
userId: undefined,
191+
timeout: 10 * 60 * 1000, // 10 minute timeout
191192
})
192193
}

evals/git-evals/run-eval-set.ts

Lines changed: 44 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -126,8 +126,9 @@ async function runEvalSet(options: {
126126
console.log(`Starting ${config.name} evaluation...`)
127127
const evalStartTime = Date.now()
128128

129+
let result
129130
try {
130-
const result = mockEval
131+
result = mockEval
131132
? mockRunGitEvals(MOCK_PATH)
132133
: await runGitEvals(
133134
config.evalDataPath,
@@ -136,57 +137,6 @@ async function runEvalSet(options: {
136137
config.limit,
137138
options.concurrency === 1
138139
)
139-
140-
const evalDuration = Date.now() - evalStartTime
141-
console.log(
142-
`✅ ${config.name} evaluation completed in ${(evalDuration / 1000).toFixed(1)}s`
143-
)
144-
145-
// Run post-eval analysis
146-
if (postEvalAnalysis) {
147-
console.log(`Running post-eval analysis for ${config.name}...`)
148-
try {
149-
const analysis = await analyzeEvalResults(result)
150-
console.log(`📊 Post-eval analysis completed for ${config.name}`)
151-
console.log(`\n=== ${config.name.toUpperCase()} ANALYSIS ===`)
152-
console.log(`Summary: ${analysis.summary}`)
153-
console.log(`\nTop Problems:`)
154-
analysis.problems.forEach((problem, i) => {
155-
console.log(
156-
`${i + 1}. [${problem.severity.toUpperCase()}] ${problem.title}`
157-
)
158-
console.log(
159-
` Frequency: ${(problem.frequency * 100).toFixed(1)}%`
160-
)
161-
console.log(` ${problem.description}`)
162-
})
163-
164-
return {
165-
name: config.name,
166-
status: 'success' as const,
167-
result,
168-
analysis,
169-
duration: evalDuration,
170-
}
171-
} catch (analysisError) {
172-
console.warn(
173-
`⚠️ Post-eval analysis failed for ${config.name}:`,
174-
analysisError
175-
)
176-
return {
177-
name: config.name,
178-
status: 'success' as const,
179-
result,
180-
duration: evalDuration,
181-
}
182-
}
183-
}
184-
return {
185-
name: config.name,
186-
status: 'success' as const,
187-
result,
188-
duration: evalDuration,
189-
}
190140
} catch (error) {
191141
const evalDuration = Date.now() - evalStartTime
192142
console.error(
@@ -200,9 +150,51 @@ async function runEvalSet(options: {
200150
duration: evalDuration,
201151
}
202152
}
153+
154+
const evalDuration = Date.now() - evalStartTime
155+
console.log(
156+
`✅ ${config.name} evaluation completed in ${(evalDuration / 1000).toFixed(1)}s`
157+
)
158+
159+
let analysis
160+
// Run post-eval analysis
161+
if (postEvalAnalysis) {
162+
console.log(`Running post-eval analysis for ${config.name}...`)
163+
try {
164+
analysis = await analyzeEvalResults(result)
165+
console.log(`📊 Post-eval analysis completed for ${config.name}`)
166+
console.log(`\n=== ${config.name.toUpperCase()} ANALYSIS ===`)
167+
console.log(`Summary: ${analysis.summary}`)
168+
console.log(`\nTop Problems:`)
169+
analysis.problems.forEach((problem, i) => {
170+
console.log(
171+
`${i + 1}. [${problem.severity.toUpperCase()}] ${problem.title}`
172+
)
173+
console.log(` Frequency: ${(problem.frequency * 100).toFixed(1)}%`)
174+
console.log(` ${problem.description}`)
175+
})
176+
} catch (analysisError) {
177+
console.warn(
178+
`⚠️ Post-eval analysis failed for ${config.name}:`,
179+
analysisError
180+
)
181+
}
182+
}
183+
184+
console.log('Completed analysis', !!analysis)
185+
186+
return {
187+
name: config.name,
188+
status: 'success' as const,
189+
result,
190+
analysis,
191+
duration: evalDuration,
192+
}
203193
})
204194

195+
console.log('Running evalPromises')
205196
const settledResults = await Promise.allSettled(evalPromises)
197+
console.log('Settled results', settledResults.length)
206198
settledResults.forEach((res, index) => {
207199
if (res.status === 'fulfilled') {
208200
results.push(res.value)

0 commit comments

Comments
 (0)