Evals: support choosing the Codebuff agent

jahooma · jahooma · commit c110170b6968 · 2025-08-26T23:17:00.000-07:00
diff --git a/evals/git-evals/run-eval-set.ts b/evals/git-evals/run-eval-set.ts
@@ -70,6 +70,10 @@ class RunEvalSetCommand extends Command {
       description: 'Coding agent to use',
       default: 'codebuff',
     }),
+    agent: Flags.string({
+      description: 'Codebuff agent id to use',
+      default: 'base',
+    }),
     help: Flags.help({ char: 'h' }),
   }
 
@@ -89,6 +93,7 @@ async function runEvalSet(options: {
   title?: string
   concurrency?: number
   'coding-agent': string
+  agent: string
 }): Promise<void> {
   const {
     'output-dir': outputDir,
@@ -98,6 +103,7 @@ async function runEvalSet(options: {
     insert: shouldInsert,
     title,
     'coding-agent': codingAgentstr,
+    agent,
   } = options
 
   if (!['codebuff', 'claude'].includes(codingAgentstr)) {
@@ -127,32 +133,28 @@ async function runEvalSet(options: {
       name: 'codebuff',
       evalDataPath: path.join(__dirname, 'eval-codebuff2.json'),
       outputDir,
-      agentType: undefined,
     },
     {
       name: 'manifold',
       evalDataPath: path.join(__dirname, 'eval-manifold2.json'),
       outputDir,
-      agentType: undefined,
     },
     {
       name: 'plane',
       evalDataPath: path.join(__dirname, 'eval-plane.json'),
       outputDir,
-      agentType: undefined,
     },
     {
       name: 'saleor',
       evalDataPath: path.join(__dirname, 'eval-saleor.json'),
       outputDir,
-      agentType: undefined,
     },
   ]
 
   console.log(`Running ${evalConfigs.length} evaluations:`)
   evalConfigs.forEach((config) => {
     console.log(
-      `  - ${config.name}: ${config.evalDataPath} -> ${config.outputDir} (${config.agentType})`,
+      `  - ${config.name}: ${config.evalDataPath} -> ${config.outputDir} (${agent})`,
     )
   })
 
@@ -174,6 +176,7 @@ async function runEvalSet(options: {
             codingAgent,
             config.limit,
             options.concurrency === 1,
+            agent,
           )
     } catch (error) {
       const evalDuration = Date.now() - evalStartTime
@@ -360,7 +363,7 @@ async function runEvalSet(options: {
           const payload: GitEvalResultRequest = {
             cost_mode: 'normal', // You can modify this based on your needs
             reasoner_model: undefined, // No longer using model config
-            agent_model: config?.agentType,
+            agent_model: agent,
             metadata: {
               numCases: evalResult?.overall_metrics?.total_runs,
               avgScore: evalResult?.overall_metrics?.average_overall,
diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts
@@ -346,6 +346,7 @@ export async function runGitEvals(
   codingAgent: 'codebuff' | 'claude',
   limit?: number,
   logToStdout: boolean = false,
+  agent: string = 'base',
 ): Promise<FullEvalLog> {
   // Set up signal handlers if this is the main module
   if (require.main === module) {
@@ -453,6 +454,7 @@ export async function runGitEvals(
                 clientSessionId,
                 fingerprintId,
                 codingAgent,
+                agent,
               ],
               {
                 stdio: ['pipe', 'pipe', 'pipe', 'ipc'],
diff --git a/evals/git-evals/run-single-eval-process.ts b/evals/git-evals/run-single-eval-process.ts
@@ -30,14 +30,16 @@ async function main() {
     clientSessionId,
     fingerprintId,
     codingAgent,
+    agent,
   ] = process.argv.slice(2)
 
   if (
     !evalCommitFilePath ||
     !projectPath ||
     !clientSessionId ||
     !fingerprintId ||
-    !codingAgent
+    !codingAgent ||
+    !agent
   ) {
     console.error('Missing required arguments for single eval process')
     process.exit(1)
@@ -71,6 +73,7 @@ async function main() {
       clientSessionId,
       fingerprintId,
       codingAgent as any,
+      agent,
     )
 
     // Check again after long-running operation
diff --git a/evals/git-evals/types.ts b/evals/git-evals/types.ts
@@ -104,7 +104,6 @@ export interface EvalConfig {
   name: string
   evalDataPath: string
   outputDir: string
-  agentType?: string
   limit?: number
 }
 

Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,6 @@ export interface EvalConfig {`
`104`	`104`	`name: string`
`105`	`105`	`evalDataPath: string`
`106`	`106`	`outputDir: string`
`107`		`- agentType?: string`
`108`	`107`	`limit?: number`
`109`	`108`	`}`
`110`	`109`