From 4a45ccddad87ba98c53524e0cf927e083a75331a Mon Sep 17 00:00:00 2001 From: Josh Lambert Date: Wed, 3 Jun 2026 10:32:27 -0400 Subject: [PATCH 1/2] feat(ai-gateway): enrich catalog with Terminal Bench stats --- .../ai-gateway/providers/openrouter/index.ts | 9 +- .../lib/model-stats/terminal-bench.test.ts | 110 ++++++++++++++++++ .../web/src/lib/model-stats/terminal-bench.ts | 83 +++++++++++++ .../lib/organizations/organization-types.ts | 6 + apps/web/src/tests/openrouter-models.test.ts | 38 +++++- 5 files changed, 240 insertions(+), 6 deletions(-) create mode 100644 apps/web/src/lib/model-stats/terminal-bench.test.ts create mode 100644 apps/web/src/lib/model-stats/terminal-bench.ts diff --git a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts index e48c4a1437..8facb59313 100644 --- a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts +++ b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts @@ -20,6 +20,7 @@ import { ATTRIBUTION_HEADERS } from '@/lib/ai-gateway/providers/openrouter/attri import { getOpenRouterModelsMetadata } from '@/lib/ai-gateway/providers/gateway-models-cache'; import { getPreferredProviderOrder } from '@/lib/ai-gateway/providers/apply-provider-specific-logic'; import { normalizeInferenceProviderId } from '@/lib/ai-gateway/providers/openrouter/inference-provider-id'; +import { getTerminalBenchSummaries, terminalBenchFor } from '@/lib/model-stats/terminal-bench'; // Re-export from shared module for backwards compatibility export { normalizeModelId } from '@/lib/ai-gateway/model-utils'; @@ -93,6 +94,7 @@ export function formatName(model: OpenRouterModel, preferredIndex: number) { async function enhancedModelList(models: OpenRouterModel[]) { const autoModels = buildAutoModels(); const endpointsMetadata = await getOpenRouterModelsMetadata(); + const summaries = await getTerminalBenchSummaries(); const enhancedModels = await Promise.all( models .filter( @@ -111,7 +113,12 @@ async function enhancedModelList(models: OpenRouterModel[]) { normalizeInferenceProviderId(preferredProvider) )?.pricing) : undefined; - return pricing ? { ...model, pricing } : model; + const terminalBench = terminalBenchFor(summaries, model.id); + return { + ...model, + ...(pricing && { pricing }), + ...(terminalBench && { terminalBench }), + }; }) .concat( kiloExclusiveModels diff --git a/apps/web/src/lib/model-stats/terminal-bench.test.ts b/apps/web/src/lib/model-stats/terminal-bench.test.ts new file mode 100644 index 0000000000..43a91a36f8 --- /dev/null +++ b/apps/web/src/lib/model-stats/terminal-bench.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, it, jest } from '@jest/globals'; +import { + createTerminalBenchFetch, + summarizeTerminalBench, + terminalBenchFor, + type TerminalBenchSummaries, +} from './terminal-bench'; + +const summary = { overallScore: 0.551, avgAttemptCostUsd: 53.37 }; + +function benchmarks( + overrides: Partial<{ nAttempts: number | null; avgAttemptCostUsd: number | null }> = {} +) { + return { + kiloBench: { + overallScore: 0.4, + evals: { + 'terminal-bench': { + taskSource: 'terminal-bench', + overallScore: summary.overallScore, + totalScore: 2.755, + avgCostUsd: 1, + avgInputTokens: 1, + avgOutputTokens: 1, + avgCacheReadTokens: 1, + avgExecutionMs: 1, + nTotalTrials: 5, + nAttempts: 5, + avgAttemptCostUsd: summary.avgAttemptCostUsd, + avgAttemptInputTokens: 1, + avgAttemptOutputTokens: 1, + avgAttemptCacheReadTokens: 1, + nErrored: 0, + lastPromotedAt: '2026-06-03T00:00:00.000Z', + ...overrides, + }, + }, + }, + }; +} + +function row(overrides: Partial[0][number]> = {}) { + return { + openrouterId: 'openai/model', + isActive: true, + isStealth: false, + benchmarks: benchmarks(), + ...overrides, + }; +} + +describe('summarizeTerminalBench', () => { + it('extracts publishable summaries keyed by OpenRouter ID', () => { + expect(summarizeTerminalBench([row()])).toEqual(new Map([['openai/model', summary]])); + }); + + it.each([ + ['inactive', row({ isActive: false })], + ['stealth', row({ isStealth: true })], + ['fewer than five attempts', row({ benchmarks: benchmarks({ nAttempts: 4 }) })], + ['null attempt cost', row({ benchmarks: benchmarks({ avgAttemptCostUsd: null }) })], + ['missing eval', row({ benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } })], + ['invalid benchmarks', row({ benchmarks: { kiloBench: { overallScore: 'invalid' } } })], + ])('omits %s rows', (_name, input) => { + expect(summarizeTerminalBench([input])).toEqual(new Map()); + }); +}); + +describe('terminalBenchFor', () => { + const summaries = new Map([['openai/model', summary]]); + + it('matches exact OpenRouter IDs', () => { + expect(terminalBenchFor(summaries, 'openai/model')).toEqual(summary); + }); + + it('matches safely prefixed Kilo gateway IDs', () => { + expect(terminalBenchFor(summaries, 'kilo/openai/model')).toEqual(summary); + }); + + it('does not strip ambiguous Kilo-owned IDs', () => { + expect(terminalBenchFor(summaries, 'kilo/special-model')).toBeUndefined(); + }); +}); + +describe('createTerminalBenchFetch', () => { + it('falls back to an empty map when the first lookup fails', async () => { + jest.spyOn(console, 'error').mockImplementation(() => undefined); + const load = jest + .fn<() => Promise>() + .mockRejectedValue(new Error('lookup failed')); + const get = createTerminalBenchFetch(load); + + expect(await get()).toEqual(new Map()); + expect(console.error).toHaveBeenCalled(); + }); + + it('falls back to the last-known-good map when refresh fails', async () => { + jest.spyOn(console, 'error').mockImplementation(() => undefined); + const good = new Map([['openai/model', summary]]); + const load = jest + .fn<() => Promise>() + .mockResolvedValueOnce(good) + .mockRejectedValueOnce(new Error('lookup failed')); + const get = createTerminalBenchFetch(load); + + expect(await get()).toBe(good); + expect(await get()).toBe(good); + expect(console.error).toHaveBeenCalled(); + }); +}); diff --git a/apps/web/src/lib/model-stats/terminal-bench.ts b/apps/web/src/lib/model-stats/terminal-bench.ts new file mode 100644 index 0000000000..ed972d9b75 --- /dev/null +++ b/apps/web/src/lib/model-stats/terminal-bench.ts @@ -0,0 +1,83 @@ +import { createCachedFetch } from '@/lib/cached-fetch'; +import { readDb } from '@/lib/drizzle'; +import { ModelStatsBenchmarksSchema, modelStats } from '@kilocode/db/schema'; +import { unprefixKiloGatewayModelId } from '@kilocode/worker-utils/kilo-model-id'; +import { and, eq } from 'drizzle-orm'; + +const TTL = process.env.NODE_ENV === 'test' ? 0 : 5 * 60 * 1000; + +export type TerminalBenchSummary = { + overallScore: number; + avgAttemptCostUsd: number; +}; + +export type TerminalBenchSummaries = ReadonlyMap; + +type Row = { + openrouterId: string; + isActive: boolean | null; + isStealth: boolean; + benchmarks: unknown; +}; + +export function summarizeTerminalBench(rows: readonly Row[]): TerminalBenchSummaries { + const summaries = new Map(); + + for (const row of rows) { + if (!row.isActive || row.isStealth) continue; + const result = ModelStatsBenchmarksSchema.safeParse(row.benchmarks); + if (!result.success) continue; + const bench = result.data?.kiloBench?.evals['terminal-bench']; + if ( + !bench || + (bench.nAttempts ?? 0) < 5 || + bench.avgAttemptCostUsd === null || + bench.avgAttemptCostUsd === undefined + ) { + continue; + } + summaries.set(row.openrouterId, { + overallScore: bench.overallScore, + avgAttemptCostUsd: bench.avgAttemptCostUsd, + }); + } + + return summaries; +} + +export function terminalBenchFor( + summaries: TerminalBenchSummaries, + id: string +): TerminalBenchSummary | undefined { + const exact = summaries.get(id); + if (exact) return exact; + const unprefixed = unprefixKiloGatewayModelId(id); + return unprefixed ? summaries.get(unprefixed) : undefined; +} + +async function loadTerminalBench(): Promise { + const rows = await readDb + .select({ + openrouterId: modelStats.openrouterId, + isActive: modelStats.isActive, + isStealth: modelStats.isStealth, + benchmarks: modelStats.benchmarks, + }) + .from(modelStats) + .where(and(eq(modelStats.isActive, true), eq(modelStats.isStealth, false))); + return summarizeTerminalBench(rows); +} + +export function createTerminalBenchFetch(load = loadTerminalBench) { + return createCachedFetch( + () => + load().catch(err => { + console.error('[terminal-bench] Failed to load model summaries:', err); + throw err; + }), + TTL, + new Map() + ); +} + +export const getTerminalBenchSummaries = createTerminalBenchFetch(); diff --git a/apps/web/src/lib/organizations/organization-types.ts b/apps/web/src/lib/organizations/organization-types.ts index 0dd02aa5bc..84d52413cc 100644 --- a/apps/web/src/lib/organizations/organization-types.ts +++ b/apps/web/src/lib/organizations/organization-types.ts @@ -199,6 +199,12 @@ const OpenRouterModelSchema = z.object({ // kilocode additions: preferredIndex: z.number().optional(), isFree: z.boolean().optional(), + terminalBench: z + .object({ + overallScore: z.number(), + avgAttemptCostUsd: z.number(), + }) + .optional(), opencode: OpenCodeSettingsSchema.optional(), id: z.string(), diff --git a/apps/web/src/tests/openrouter-models.test.ts b/apps/web/src/tests/openrouter-models.test.ts index 044e8d5857..f77333fcd8 100644 --- a/apps/web/src/tests/openrouter-models.test.ts +++ b/apps/web/src/tests/openrouter-models.test.ts @@ -1,15 +1,22 @@ -import { test, expect, describe, afterEach, jest, beforeEach } from '@jest/globals'; +import { test, expect, describe, afterEach, beforeEach } from '@jest/globals'; import { mockOpenRouterModels, createMockResponse } from './helpers/openrouter-models.helper'; import { GET } from '../app/api/openrouter/models/route'; import { NextRequest } from 'next/server'; jest.mock('@/lib/user/server', () => ({ - getUserByAuthorizationHeader: jest.fn().mockImplementation(async () => ({ + getUserFromAuth: jest.fn(async () => ({ user: { id: 'test-user-id' }, - authFailedResponse: null, + organizationId: null, })), })); +jest.mock('@/lib/model-stats/terminal-bench', () => ({ + getTerminalBenchSummaries: jest.fn( + async () => new Map([['some-other-model', { overallScore: 0.551, avgAttemptCostUsd: 53.37 }]]) + ), + terminalBenchFor: jest.fn((summaries: Map, id: string) => summaries.get(id)), +})); + function createTestRequest(path: string) { return new NextRequest(new URL(path, 'http://localhost:3000'), { method: 'GET', @@ -18,8 +25,7 @@ function createTestRequest(path: string) { describe('GET /api/openrouter/models', () => { beforeEach(() => { - // Reset all mocks before each test - jest.resetAllMocks(); + jest.clearAllMocks(); }); test('should handle OpenRouter API errors', async () => { @@ -86,6 +92,28 @@ describe('GET /api/openrouter/models', () => { expect(responseData.data).toBeDefined(); expect(Array.isArray(responseData.data)).toBe(true); }); + + test('should include publishable Terminal Bench summaries for canonical models', async () => { + const request = createTestRequest('/api/openrouter/models'); + + global.fetch = jest.fn(() => { + return Promise.resolve( + createMockResponse({ + ok: true, + status: 200, + statusText: 'OK', + jsonData: mockOpenRouterModels, + }) + ); + }) as unknown as typeof fetch; + + const response = await GET(request); + const responseData = await response.json(); + const model = responseData.data.find((item: { id: string }) => item.id === 'some-other-model'); + + expect(response.status).toBe(200); + expect(model.terminalBench).toEqual({ overallScore: 0.551, avgAttemptCostUsd: 53.37 }); + }); }); afterEach(() => { From 22589e817d0bc94d4f53f1e8e40192d014b31ca6 Mon Sep 17 00:00:00 2001 From: Josh Lambert Date: Fri, 5 Jun 2026 16:41:49 -0400 Subject: [PATCH 2/2] test(ai-gateway): focus Terminal Bench coverage --- .../lib/model-stats/terminal-bench.test.ts | 70 ++++--------------- .../web/src/lib/model-stats/terminal-bench.ts | 2 +- 2 files changed, 16 insertions(+), 56 deletions(-) diff --git a/apps/web/src/lib/model-stats/terminal-bench.test.ts b/apps/web/src/lib/model-stats/terminal-bench.test.ts index 43a91a36f8..26d211e04a 100644 --- a/apps/web/src/lib/model-stats/terminal-bench.test.ts +++ b/apps/web/src/lib/model-stats/terminal-bench.test.ts @@ -1,10 +1,5 @@ -import { describe, expect, it, jest } from '@jest/globals'; -import { - createTerminalBenchFetch, - summarizeTerminalBench, - terminalBenchFor, - type TerminalBenchSummaries, -} from './terminal-bench'; +import { describe, expect, it } from '@jest/globals'; +import { summarizeTerminalBench, terminalBenchFor } from './terminal-bench'; const summary = { overallScore: 0.551, avgAttemptCostUsd: 53.37 }; @@ -50,61 +45,26 @@ function row(overrides: Partial[0][num } describe('summarizeTerminalBench', () => { - it('extracts publishable summaries keyed by OpenRouter ID', () => { - expect(summarizeTerminalBench([row()])).toEqual(new Map([['openai/model', summary]])); - }); + it('publishes only eligible summaries', () => { + const summaries = summarizeTerminalBench([ + row(), + row({ isActive: false }), + row({ isStealth: true }), + row({ benchmarks: benchmarks({ nAttempts: 4 }) }), + row({ benchmarks: benchmarks({ avgAttemptCostUsd: null }) }), + row({ benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } }), + row({ benchmarks: { kiloBench: { overallScore: 'invalid' } } }), + ]); - it.each([ - ['inactive', row({ isActive: false })], - ['stealth', row({ isStealth: true })], - ['fewer than five attempts', row({ benchmarks: benchmarks({ nAttempts: 4 }) })], - ['null attempt cost', row({ benchmarks: benchmarks({ avgAttemptCostUsd: null }) })], - ['missing eval', row({ benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } })], - ['invalid benchmarks', row({ benchmarks: { kiloBench: { overallScore: 'invalid' } } })], - ])('omits %s rows', (_name, input) => { - expect(summarizeTerminalBench([input])).toEqual(new Map()); + expect(summaries).toEqual(new Map([['openai/model', summary]])); }); }); describe('terminalBenchFor', () => { - const summaries = new Map([['openai/model', summary]]); - - it('matches exact OpenRouter IDs', () => { - expect(terminalBenchFor(summaries, 'openai/model')).toEqual(summary); - }); + it('matches only safe canonical IDs', () => { + const summaries = new Map([['openai/model', summary]]); - it('matches safely prefixed Kilo gateway IDs', () => { expect(terminalBenchFor(summaries, 'kilo/openai/model')).toEqual(summary); - }); - - it('does not strip ambiguous Kilo-owned IDs', () => { expect(terminalBenchFor(summaries, 'kilo/special-model')).toBeUndefined(); }); }); - -describe('createTerminalBenchFetch', () => { - it('falls back to an empty map when the first lookup fails', async () => { - jest.spyOn(console, 'error').mockImplementation(() => undefined); - const load = jest - .fn<() => Promise>() - .mockRejectedValue(new Error('lookup failed')); - const get = createTerminalBenchFetch(load); - - expect(await get()).toEqual(new Map()); - expect(console.error).toHaveBeenCalled(); - }); - - it('falls back to the last-known-good map when refresh fails', async () => { - jest.spyOn(console, 'error').mockImplementation(() => undefined); - const good = new Map([['openai/model', summary]]); - const load = jest - .fn<() => Promise>() - .mockResolvedValueOnce(good) - .mockRejectedValueOnce(new Error('lookup failed')); - const get = createTerminalBenchFetch(load); - - expect(await get()).toBe(good); - expect(await get()).toBe(good); - expect(console.error).toHaveBeenCalled(); - }); -}); diff --git a/apps/web/src/lib/model-stats/terminal-bench.ts b/apps/web/src/lib/model-stats/terminal-bench.ts index ed972d9b75..eb692e6452 100644 --- a/apps/web/src/lib/model-stats/terminal-bench.ts +++ b/apps/web/src/lib/model-stats/terminal-bench.ts @@ -68,7 +68,7 @@ async function loadTerminalBench(): Promise { return summarizeTerminalBench(rows); } -export function createTerminalBenchFetch(load = loadTerminalBench) { +function createTerminalBenchFetch(load = loadTerminalBench) { return createCachedFetch( () => load().catch(err => {