@@ -21,15 +21,15 @@ function buildAnalysisPrompt(
2121 . map ( ( state ) => {
2222 const diff = createPatch ( state . path , state . preContent , state . postContent )
2323 let content = `File: ${ state . path } \n\nUnified Diff (Ground Truth):\n${ diff } `
24-
24+
2525 if ( includeBeforeContent ) {
2626 content += `\n\nPre-commit content:\n${ state . preContent } `
2727 }
28-
28+
2929 if ( includeAfterContent ) {
3030 content += `\n\nPost-commit content (Ground Truth):\n${ state . postContent } `
3131 }
32-
32+
3333 return content
3434 } )
3535 . join ( '\n\n---\n\n' )
@@ -39,25 +39,27 @@ function buildAnalysisPrompt(
3939 . map ( ( state ) => {
4040 const diff = createPatch ( state . path , state . preContent , state . postContent )
4141 let content = `File: ${ state . path } \n\nUnified Diff (Codebuff's Changes):\n${ diff } `
42-
42+
4343 if ( includeBeforeContent ) {
4444 content += `\n\nPre-commit content:\n${ state . preContent } `
4545 }
46-
46+
4747 if ( includeAfterContent ) {
4848 content += `\n\nPost-commit content (Codebuff's Attempt):\n${ state . postContent } `
4949 }
50-
50+
5151 return content
5252 } )
5353 . join ( '\n\n---\n\n' )
5454
5555 // Build trace section
56- const traceContent = truncatedTrace || evalRun . trace
57- . map ( ( { prompt, steps } ) =>
58- `Prompt: ${ prompt } \n\nCodebuff Steps: ${ JSON . stringify ( steps ) } ` . trim ( )
59- )
60- . join ( '\n\n' )
56+ const traceContent =
57+ truncatedTrace ||
58+ evalRun . trace
59+ . map ( ( { prompt, steps } ) =>
60+ `Prompt: ${ prompt } \n\nCodebuff Steps: ${ JSON . stringify ( steps ) } ` . trim ( )
61+ )
62+ . join ( '\n\n' )
6163
6264 return `You are an expert software engineer tasked with analyzing and scoring the code quality of changes made by an AI coding assistant (Codebuff). Please analyze the following interaction trace and compare both the attempted changes and the ground truth changes.
6365
@@ -107,25 +109,26 @@ Provide your response in a structured format with analysis, lists of strengths a
/**
 * Renders a trace as prompt text, dropping entries from the end until the
 * rendered text fits within a token budget measured by `countTokens`.
 *
 * Each entry is rendered as `Prompt: <prompt>\n\nCodebuff Steps: <steps as JSON>`
 * and entries are separated by a blank line. The input array is not mutated.
 *
 * @param trace - Entries exposing `prompt` and `steps` properties.
 * @param maxTokens - Budget for the rendered entries. The truncation notice
 *   appended to the result is not counted against this budget.
 * @returns The longest leading run of entries that fits, followed by a notice
 *   when entries were dropped; a fixed notice when not even the first entry
 *   fits; or an empty string when `trace` is empty.
 */
function truncateTraceFromEnd(trace: any[], maxTokens: number): string {
  // An empty trace has nothing to truncate, so do not report a truncation.
  if (trace.length === 0) {
    return ''
  }

  // Serialize every entry exactly once: an entry's text never changes between
  // attempts, only the number of leading entries that are kept.
  const renderedEntries = trace.map(({ prompt, steps }) =>
    `Prompt: ${prompt}\n\nCodebuff Steps: ${JSON.stringify(steps)}`.trim()
  )

  // Try the full trace first, then progressively drop entries from the end.
  for (let kept = renderedEntries.length; kept > 0; kept--) {
    const traceContent = renderedEntries.slice(0, kept).join('\n\n')

    if (countTokens(traceContent) > maxTokens) {
      continue
    }

    const truncationNotice =
      kept < trace.length
        ? `\n\n[TRACE TRUNCATED: Showing ${kept} of ${trace.length} trace entries to fit within token limit]`
        : ''
    return traceContent + truncationNotice
  }

  // Not even the first entry fits within the budget.
  return '[TRACE TRUNCATED: All trace entries removed to fit within token limit]'
}
131134
@@ -134,9 +137,21 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
134137
135138 // Try different levels of content inclusion until we fit within token limit
136139 const attempts = [
137- { includeBeforeContent : true , includeAfterContent : true , truncatedTrace : undefined } ,
138- { includeBeforeContent : false , includeAfterContent : true , truncatedTrace : undefined } ,
139- { includeBeforeContent : false , includeAfterContent : false , truncatedTrace : undefined } ,
140+ {
141+ includeBeforeContent : true ,
142+ includeAfterContent : true ,
143+ truncatedTrace : undefined ,
144+ } ,
145+ {
146+ includeBeforeContent : false ,
147+ includeAfterContent : true ,
148+ truncatedTrace : undefined ,
149+ } ,
150+ {
151+ includeBeforeContent : false ,
152+ includeAfterContent : false ,
153+ truncatedTrace : undefined ,
154+ } ,
140155 ]
141156
142157 for ( const attempt of attempts ) {
@@ -146,11 +161,13 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
146161 attempt . includeAfterContent ,
147162 attempt . truncatedTrace
148163 )
149-
164+
150165 const tokenCount = countTokens ( prompt )
151-
166+
152167 if ( tokenCount <= MAX_TOKENS ) {
153- console . log ( `Using prompt with ${ tokenCount } tokens (before: ${ attempt . includeBeforeContent } , after: ${ attempt . includeAfterContent } )` )
168+ console . log (
169+ `Using prompt with ${ tokenCount } tokens (before: ${ attempt . includeBeforeContent } , after: ${ attempt . includeAfterContent } )`
170+ )
154171 finalPrompt = prompt
155172 break
156173 }
@@ -163,28 +180,30 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
163180 { ...evalRun , trace : [ ] } , // Empty trace
164181 false , // includeBeforeContent
165182 false , // includeAfterContent
166- '' // empty trace content
183+ '' // empty trace content
167184 )
168185 const baseTokens = countTokens ( basePrompt )
169186 const maxTraceTokens = MAX_TOKENS - baseTokens - 100 // Reserve 100 tokens for truncation notice
170-
187+
171188 const truncatedTrace = truncateTraceFromEnd ( evalRun . trace , maxTraceTokens )
172-
189+
173190 finalPrompt = buildAnalysisPrompt (
174191 evalRun ,
175192 false , // includeBeforeContent
176193 false , // includeAfterContent
177194 truncatedTrace
178195 )
179-
196+
180197 const finalTokenCount = countTokens ( finalPrompt )
181- console . log ( `Using truncated prompt with ${ finalTokenCount } tokens (trace truncated, base: ${ baseTokens } , max trace: ${ maxTraceTokens } )` )
198+ console . log (
199+ `Using truncated prompt with ${ finalTokenCount } tokens (trace truncated, base: ${ baseTokens } , max trace: ${ maxTraceTokens } )`
200+ )
182201 }
183202
184203 // Run 3 judges in parallel
185204 console . log ( 'Running 3 judges in parallel for more robust scoring...' )
186-
187- const judgePromises = Array . from ( { length : 3 } , ( _ , index ) =>
205+
206+ const judgePromises = Array . from ( { length : 3 } , ( _ , index ) =>
188207 promptAiSdkStructured ( {
189208 messages : [ { role : 'user' , content : finalPrompt } ] ,
190209 schema : JudgingAnalysisSchema ,
@@ -193,14 +212,15 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
193212 fingerprintId : generateCompactId ( ) ,
194213 userInputId : generateCompactId ( ) ,
195214 userId : undefined ,
196- } ) . catch ( error => {
215+ timeout : 10 * 60 * 1000 , // 10 minute timeout
216+ } ) . catch ( ( error ) => {
197217 console . warn ( `Judge ${ index + 1 } failed:` , error )
198218 return null
199219 } )
200220 )
201221
202222 const judgeResults = await Promise . all ( judgePromises )
203- const validResults = judgeResults . filter ( result => result !== null )
223+ const validResults = judgeResults . filter ( ( result ) => result !== null )
204224
205225 if ( validResults . length === 0 ) {
206226 throw new Error ( 'All judges failed to provide results' )
@@ -209,11 +229,15 @@ export async function judgeEvalRun(evalRun: EvalRunLog) {
209229 console . log ( `Successfully got results from ${ validResults . length } /3 judges` )
210230
211231 // Sort judges by overall score and select the median
212- const sortedResults = validResults . sort ( ( a , b ) => a . metrics . overallScore - b . metrics . overallScore )
232+ const sortedResults = validResults . sort (
233+ ( a , b ) => a . metrics . overallScore - b . metrics . overallScore
234+ )
213235 const medianIndex = Math . floor ( sortedResults . length / 2 )
214236 const medianResult = sortedResults [ medianIndex ]
215237
216- console . log ( `Using median judge (${ medianIndex + 1 } of ${ sortedResults . length } ) with overall score: ${ medianResult . metrics . overallScore } ` )
238+ console . log (
239+ `Using median judge (${ medianIndex + 1 } of ${ sortedResults . length } ) with overall score: ${ medianResult . metrics . overallScore } `
240+ )
217241
218242 return medianResult
219243}
0 commit comments