diff --git a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts index 1bae3a8085..e9be5b824c 100644 --- a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts +++ b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts @@ -24,6 +24,7 @@ import { ATTRIBUTION_HEADERS } from '@/lib/ai-gateway/providers/openrouter/attri import { getOpenRouterModelsMetadata } from '@/lib/ai-gateway/providers/gateway-models-cache'; import { getPreferredProviderOrder } from '@/lib/ai-gateway/providers/apply-provider-specific-logic'; import { normalizeInferenceProviderId } from '@/lib/ai-gateway/providers/openrouter/inference-provider-id'; +import { getTerminalBenchSummaries, terminalBenchFor } from '@/lib/model-stats/terminal-bench'; import { isFreeNemotronModel, NVIDIA_TRIAL_TOS } from '@/lib/ai-gateway/providers/nvidia'; // Re-export from shared module for backwards compatibility @@ -119,6 +120,7 @@ export function shouldSuppressOpenRouterModel(model: KiloExclusiveModel): boolea async function enhancedModelList(models: OpenRouterModel[]) { const autoModels = buildAutoModels(); const endpointsMetadata = await getOpenRouterModelsMetadata(); + const summaries = await getTerminalBenchSummaries(); const enhancedModels = await Promise.all( models .filter( @@ -139,7 +141,12 @@ async function enhancedModelList(models: OpenRouterModel[]) { normalizeInferenceProviderId(preferredProvider) )?.pricing); const pricing = rawPricing ? undoPricingDiscount(rawPricing) : rawPricing; - return pricing ? { ...model, pricing } : model; + const terminalBench = terminalBenchFor(summaries, model.id); + return { + ...model, + ...(pricing && { pricing }), + ...(terminalBench && { terminalBench }), + }; }) .concat( kiloExclusiveModels diff --git a/apps/web/src/lib/model-stats/terminal-bench.test.ts b/apps/web/src/lib/model-stats/terminal-bench.test.ts new file mode 100644 index 0000000000..26d211e04a --- /dev/null +++ b/apps/web/src/lib/model-stats/terminal-bench.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from '@jest/globals'; +import { summarizeTerminalBench, terminalBenchFor } from './terminal-bench'; + +const summary = { overallScore: 0.551, avgAttemptCostUsd: 53.37 }; + +function benchmarks( + overrides: Partial<{ nAttempts: number | null; avgAttemptCostUsd: number | null }> = {} +) { + return { + kiloBench: { + overallScore: 0.4, + evals: { + 'terminal-bench': { + taskSource: 'terminal-bench', + overallScore: summary.overallScore, + totalScore: 2.755, + avgCostUsd: 1, + avgInputTokens: 1, + avgOutputTokens: 1, + avgCacheReadTokens: 1, + avgExecutionMs: 1, + nTotalTrials: 5, + nAttempts: 5, + avgAttemptCostUsd: summary.avgAttemptCostUsd, + avgAttemptInputTokens: 1, + avgAttemptOutputTokens: 1, + avgAttemptCacheReadTokens: 1, + nErrored: 0, + lastPromotedAt: '2026-06-03T00:00:00.000Z', + ...overrides, + }, + }, + }, + }; +} + +function row(overrides: Partial[0][number]> = {}) { + return { + openrouterId: 'openai/model', + isActive: true, + isStealth: false, + benchmarks: benchmarks(), + ...overrides, + }; +} + +describe('summarizeTerminalBench', () => { + it('publishes only eligible summaries', () => { + const summaries = summarizeTerminalBench([ + row(), + row({ isActive: false }), + row({ isStealth: true }), + row({ benchmarks: benchmarks({ nAttempts: 4 }) }), + row({ benchmarks: benchmarks({ avgAttemptCostUsd: null }) }), + row({ benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } }), + row({ benchmarks: { kiloBench: { overallScore: 'invalid' } } }), + ]); + + expect(summaries).toEqual(new Map([['openai/model', summary]])); + }); +}); + +describe('terminalBenchFor', () => { + it('matches only safe canonical IDs', () => { + const summaries = new Map([['openai/model', summary]]); + + expect(terminalBenchFor(summaries, 'kilo/openai/model')).toEqual(summary); + expect(terminalBenchFor(summaries, 'kilo/special-model')).toBeUndefined(); + }); +}); diff --git a/apps/web/src/lib/model-stats/terminal-bench.ts b/apps/web/src/lib/model-stats/terminal-bench.ts new file mode 100644 index 0000000000..eb692e6452 --- /dev/null +++ b/apps/web/src/lib/model-stats/terminal-bench.ts @@ -0,0 +1,83 @@ +import { createCachedFetch } from '@/lib/cached-fetch'; +import { readDb } from '@/lib/drizzle'; +import { ModelStatsBenchmarksSchema, modelStats } from '@kilocode/db/schema'; +import { unprefixKiloGatewayModelId } from '@kilocode/worker-utils/kilo-model-id'; +import { and, eq } from 'drizzle-orm'; + +const TTL = process.env.NODE_ENV === 'test' ? 0 : 5 * 60 * 1000; + +export type TerminalBenchSummary = { + overallScore: number; + avgAttemptCostUsd: number; +}; + +export type TerminalBenchSummaries = ReadonlyMap; + +type Row = { + openrouterId: string; + isActive: boolean | null; + isStealth: boolean; + benchmarks: unknown; +}; + +export function summarizeTerminalBench(rows: readonly Row[]): TerminalBenchSummaries { + const summaries = new Map(); + + for (const row of rows) { + if (!row.isActive || row.isStealth) continue; + const result = ModelStatsBenchmarksSchema.safeParse(row.benchmarks); + if (!result.success) continue; + const bench = result.data?.kiloBench?.evals['terminal-bench']; + if ( + !bench || + (bench.nAttempts ?? 0) < 5 || + bench.avgAttemptCostUsd === null || + bench.avgAttemptCostUsd === undefined + ) { + continue; + } + summaries.set(row.openrouterId, { + overallScore: bench.overallScore, + avgAttemptCostUsd: bench.avgAttemptCostUsd, + }); + } + + return summaries; +} + +export function terminalBenchFor( + summaries: TerminalBenchSummaries, + id: string +): TerminalBenchSummary | undefined { + const exact = summaries.get(id); + if (exact) return exact; + const unprefixed = unprefixKiloGatewayModelId(id); + return unprefixed ? summaries.get(unprefixed) : undefined; +} + +async function loadTerminalBench(): Promise { + const rows = await readDb + .select({ + openrouterId: modelStats.openrouterId, + isActive: modelStats.isActive, + isStealth: modelStats.isStealth, + benchmarks: modelStats.benchmarks, + }) + .from(modelStats) + .where(and(eq(modelStats.isActive, true), eq(modelStats.isStealth, false))); + return summarizeTerminalBench(rows); +} + +function createTerminalBenchFetch(load = loadTerminalBench) { + return createCachedFetch( + () => + load().catch(err => { + console.error('[terminal-bench] Failed to load model summaries:', err); + throw err; + }), + TTL, + new Map() + ); +} + +export const getTerminalBenchSummaries = createTerminalBenchFetch(); diff --git a/apps/web/src/lib/organizations/organization-types.ts b/apps/web/src/lib/organizations/organization-types.ts index 0dd02aa5bc..84d52413cc 100644 --- a/apps/web/src/lib/organizations/organization-types.ts +++ b/apps/web/src/lib/organizations/organization-types.ts @@ -199,6 +199,12 @@ const OpenRouterModelSchema = z.object({ // kilocode additions: preferredIndex: z.number().optional(), isFree: z.boolean().optional(), + terminalBench: z + .object({ + overallScore: z.number(), + avgAttemptCostUsd: z.number(), + }) + .optional(), opencode: OpenCodeSettingsSchema.optional(), id: z.string(), diff --git a/apps/web/src/tests/openrouter-models.test.ts b/apps/web/src/tests/openrouter-models.test.ts index 044e8d5857..f77333fcd8 100644 --- a/apps/web/src/tests/openrouter-models.test.ts +++ b/apps/web/src/tests/openrouter-models.test.ts @@ -1,15 +1,22 @@ -import { test, expect, describe, afterEach, jest, beforeEach } from '@jest/globals'; +import { test, expect, describe, afterEach, beforeEach } from '@jest/globals'; import { mockOpenRouterModels, createMockResponse } from './helpers/openrouter-models.helper'; import { GET } from '../app/api/openrouter/models/route'; import { NextRequest } from 'next/server'; jest.mock('@/lib/user/server', () => ({ - getUserByAuthorizationHeader: jest.fn().mockImplementation(async () => ({ + getUserFromAuth: jest.fn(async () => ({ user: { id: 'test-user-id' }, - authFailedResponse: null, + organizationId: null, })), })); +jest.mock('@/lib/model-stats/terminal-bench', () => ({ + getTerminalBenchSummaries: jest.fn( + async () => new Map([['some-other-model', { overallScore: 0.551, avgAttemptCostUsd: 53.37 }]]) + ), + terminalBenchFor: jest.fn((summaries: Map, id: string) => summaries.get(id)), +})); + function createTestRequest(path: string) { return new NextRequest(new URL(path, 'http://localhost:3000'), { method: 'GET', @@ -18,8 +25,7 @@ function createTestRequest(path: string) { describe('GET /api/openrouter/models', () => { beforeEach(() => { - // Reset all mocks before each test - jest.resetAllMocks(); + jest.clearAllMocks(); }); test('should handle OpenRouter API errors', async () => { @@ -86,6 +92,28 @@ describe('GET /api/openrouter/models', () => { expect(responseData.data).toBeDefined(); expect(Array.isArray(responseData.data)).toBe(true); }); + + test('should include publishable Terminal Bench summaries for canonical models', async () => { + const request = createTestRequest('/api/openrouter/models'); + + global.fetch = jest.fn(() => { + return Promise.resolve( + createMockResponse({ + ok: true, + status: 200, + statusText: 'OK', + jsonData: mockOpenRouterModels, + }) + ); + }) as unknown as typeof fetch; + + const response = await GET(request); + const responseData = await response.json(); + const model = responseData.data.find((item: { id: string }) => item.id === 'some-other-model'); + + expect(response.status).toBe(200); + expect(model.terminalBench).toEqual({ overallScore: 0.551, avgAttemptCostUsd: 53.37 }); + }); }); afterEach(() => {