Skip to content

Commit c110170

Browse files
committed
evals support choosing agent
1 parent: c172ed3 · commit: c110170

File tree

4 files changed

15 additions and 8 deletions (+15 / -8 lines changed)

4 files changed

15 additions and 8 deletions (+15 / -8 lines changed)

evals/git-evals/run-eval-set.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ class RunEvalSetCommand extends Command {
7070
description: 'Coding agent to use',
7171
default: 'codebuff',
7272
}),
73+
agent: Flags.string({
74+
description: 'Codebuff agent id to use',
75+
default: 'base',
76+
}),
7377
help: Flags.help({ char: 'h' }),
7478
}
7579

@@ -89,6 +93,7 @@ async function runEvalSet(options: {
8993
title?: string
9094
concurrency?: number
9195
'coding-agent': string
96+
agent: string
9297
}): Promise<void> {
9398
const {
9499
'output-dir': outputDir,
@@ -98,6 +103,7 @@ async function runEvalSet(options: {
98103
insert: shouldInsert,
99104
title,
100105
'coding-agent': codingAgentstr,
106+
agent,
101107
} = options
102108

103109
if (!['codebuff', 'claude'].includes(codingAgentstr)) {
@@ -127,32 +133,28 @@ async function runEvalSet(options: {
127133
name: 'codebuff',
128134
evalDataPath: path.join(__dirname, 'eval-codebuff2.json'),
129135
outputDir,
130-
agentType: undefined,
131136
},
132137
{
133138
name: 'manifold',
134139
evalDataPath: path.join(__dirname, 'eval-manifold2.json'),
135140
outputDir,
136-
agentType: undefined,
137141
},
138142
{
139143
name: 'plane',
140144
evalDataPath: path.join(__dirname, 'eval-plane.json'),
141145
outputDir,
142-
agentType: undefined,
143146
},
144147
{
145148
name: 'saleor',
146149
evalDataPath: path.join(__dirname, 'eval-saleor.json'),
147150
outputDir,
148-
agentType: undefined,
149151
},
150152
]
151153

152154
console.log(`Running ${evalConfigs.length} evaluations:`)
153155
evalConfigs.forEach((config) => {
154156
console.log(
155-
` - ${config.name}: ${config.evalDataPath} -> ${config.outputDir} (${config.agentType})`,
157+
` - ${config.name}: ${config.evalDataPath} -> ${config.outputDir} (${agent})`,
156158
)
157159
})
158160

@@ -174,6 +176,7 @@ async function runEvalSet(options: {
174176
codingAgent,
175177
config.limit,
176178
options.concurrency === 1,
179+
agent,
177180
)
178181
} catch (error) {
179182
const evalDuration = Date.now() - evalStartTime
@@ -360,7 +363,7 @@ async function runEvalSet(options: {
360363
const payload: GitEvalResultRequest = {
361364
cost_mode: 'normal', // You can modify this based on your needs
362365
reasoner_model: undefined, // No longer using model config
363-
agent_model: config?.agentType,
366+
agent_model: agent,
364367
metadata: {
365368
numCases: evalResult?.overall_metrics?.total_runs,
366369
avgScore: evalResult?.overall_metrics?.average_overall,

evals/git-evals/run-git-evals.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ export async function runGitEvals(
346346
codingAgent: 'codebuff' | 'claude',
347347
limit?: number,
348348
logToStdout: boolean = false,
349+
agent: string = 'base',
349350
): Promise<FullEvalLog> {
350351
// Set up signal handlers if this is the main module
351352
if (require.main === module) {
@@ -453,6 +454,7 @@ export async function runGitEvals(
453454
clientSessionId,
454455
fingerprintId,
455456
codingAgent,
457+
agent,
456458
],
457459
{
458460
stdio: ['pipe', 'pipe', 'pipe', 'ipc'],

evals/git-evals/run-single-eval-process.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,16 @@ async function main() {
3030
clientSessionId,
3131
fingerprintId,
3232
codingAgent,
33+
agent,
3334
] = process.argv.slice(2)
3435

3536
if (
3637
!evalCommitFilePath ||
3738
!projectPath ||
3839
!clientSessionId ||
3940
!fingerprintId ||
40-
!codingAgent
41+
!codingAgent ||
42+
!agent
4143
) {
4244
console.error('Missing required arguments for single eval process')
4345
process.exit(1)
@@ -71,6 +73,7 @@ async function main() {
7173
clientSessionId,
7274
fingerprintId,
7375
codingAgent as any,
76+
agent,
7477
)
7578

7679
// Check again after long-running operation

evals/git-evals/types.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,6 @@ export interface EvalConfig {
104104
name: string
105105
evalDataPath: string
106106
outputDir: string
107-
agentType?: string
108107
limit?: number
109108
}
110109

0 commit comments

Comments
 (0)