Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import { ATTRIBUTION_HEADERS } from '@/lib/ai-gateway/providers/openrouter/attri
import { getOpenRouterModelsMetadata } from '@/lib/ai-gateway/providers/gateway-models-cache';
import { getPreferredProviderOrder } from '@/lib/ai-gateway/providers/apply-provider-specific-logic';
import { normalizeInferenceProviderId } from '@/lib/ai-gateway/providers/openrouter/inference-provider-id';
import { getTerminalBenchSummaries, terminalBenchFor } from '@/lib/model-stats/terminal-bench';
import { isFreeNemotronModel, NVIDIA_TRIAL_TOS } from '@/lib/ai-gateway/providers/nvidia';

// Re-export from shared module for backwards compatibility
Expand Down Expand Up @@ -119,6 +120,7 @@ export function shouldSuppressOpenRouterModel(model: KiloExclusiveModel): boolea
async function enhancedModelList(models: OpenRouterModel[]) {
const autoModels = buildAutoModels();
const endpointsMetadata = await getOpenRouterModelsMetadata();
const summaries = await getTerminalBenchSummaries();
const enhancedModels = await Promise.all(
models
.filter(
Expand All @@ -139,7 +141,12 @@ async function enhancedModelList(models: OpenRouterModel[]) {
normalizeInferenceProviderId(preferredProvider)
)?.pricing);
const pricing = rawPricing ? undoPricingDiscount(rawPricing) : rawPricing;
return pricing ? { ...model, pricing } : model;
const terminalBench = terminalBenchFor(summaries, model.id);
return {
...model,
...(pricing && { pricing }),
...(terminalBench && { terminalBench }),
};
})
.concat(
kiloExclusiveModels
Expand Down
70 changes: 70 additions & 0 deletions apps/web/src/lib/model-stats/terminal-bench.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import { describe, expect, it } from '@jest/globals';
import { summarizeTerminalBench, terminalBenchFor } from './terminal-bench';

// Score/cost pair that the fixture's `terminal-bench` eval carries by default.
const summary = { overallScore: 0.551, avgAttemptCostUsd: 53.37 };

/**
 * Builds a benchmarks payload holding a single `terminal-bench` eval.
 *
 * @param overrides Replacement values for `nAttempts` and/or `avgAttemptCostUsd`;
 *   they are spread last, so they win over the defaults below.
 * @returns A fresh `{ kiloBench: { overallScore, evals } }` object on every call.
 */
function benchmarks(
  overrides: Partial<{ nAttempts: number | null; avgAttemptCostUsd: number | null }> = {}
) {
  const terminalBenchEval = {
    taskSource: 'terminal-bench',
    overallScore: summary.overallScore,
    totalScore: 2.755,
    avgCostUsd: 1,
    avgInputTokens: 1,
    avgOutputTokens: 1,
    avgCacheReadTokens: 1,
    avgExecutionMs: 1,
    nTotalTrials: 5,
    nAttempts: 5,
    avgAttemptCostUsd: summary.avgAttemptCostUsd,
    avgAttemptInputTokens: 1,
    avgAttemptOutputTokens: 1,
    avgAttemptCacheReadTokens: 1,
    nErrored: 0,
    lastPromotedAt: '2026-06-03T00:00:00.000Z',
  };
  const evals = { 'terminal-bench': { ...terminalBenchEval, ...overrides } };

  return { kiloBench: { overallScore: 0.4, evals } };
}

/**
 * Builds one input row for `summarizeTerminalBench`.
 *
 * The defaults describe an active, non-stealth `openai/model` row carrying the
 * default `benchmarks()` payload; any field in `overrides` replaces its default.
 */
function row(overrides: Partial<Parameters<typeof summarizeTerminalBench>[0][number]> = {}) {
  const defaults = {
    openrouterId: 'openai/model',
    isActive: true,
    isStealth: false,
    benchmarks: benchmarks(),
  };

  return { ...defaults, ...overrides };
}

describe('summarizeTerminalBench', () => {
  it('publishes only eligible summaries', () => {
    // Each ineligible row has its own ID. If they all shared 'openai/model',
    // a row that slipped through a broken filter would overwrite that key with
    // an identical score/cost pair and the assertion below would still pass.
    const summaries = summarizeTerminalBench([
      row(),
      row({ openrouterId: 'openai/inactive', isActive: false }),
      row({ openrouterId: 'openai/stealth', isStealth: true }),
      row({
        openrouterId: 'openai/too-few-attempts',
        benchmarks: benchmarks({ nAttempts: 4 }),
      }),
      row({
        openrouterId: 'openai/no-attempt-cost',
        benchmarks: benchmarks({ avgAttemptCostUsd: null }),
      }),
      row({
        openrouterId: 'openai/no-terminal-bench',
        benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } },
      }),
      row({
        openrouterId: 'openai/invalid-benchmarks',
        benchmarks: { kiloBench: { overallScore: 'invalid' } },
      }),
    ]);

    // Only the fully eligible default row may be published.
    expect(summaries).toEqual(new Map([['openai/model', summary]]));
  });
});

describe('terminalBenchFor', () => {
  it('matches only safe canonical IDs', () => {
    const summaries = new Map([['openai/model', summary]]);

    // An ID stored in the map resolves directly, before any prefix handling.
    expect(terminalBenchFor(summaries, 'openai/model')).toEqual(summary);
    // A Kilo-prefixed ID resolves through its unprefixed form.
    expect(terminalBenchFor(summaries, 'kilo/openai/model')).toEqual(summary);
    // A Kilo-prefixed ID with no matching entry yields nothing.
    expect(terminalBenchFor(summaries, 'kilo/special-model')).toBeUndefined();
  });
});
83 changes: 83 additions & 0 deletions apps/web/src/lib/model-stats/terminal-bench.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import { createCachedFetch } from '@/lib/cached-fetch';
import { readDb } from '@/lib/drizzle';
import { ModelStatsBenchmarksSchema, modelStats } from '@kilocode/db/schema';
import { unprefixKiloGatewayModelId } from '@kilocode/worker-utils/kilo-model-id';
import { and, eq } from 'drizzle-orm';

// Passed to `createCachedFetch` as its second argument: 0 when NODE_ENV is 'test',
// otherwise 300000. NOTE(review): presumably a lifetime in milliseconds (5 minutes) — confirm.
const TTL = process.env.NODE_ENV !== 'test' ? 5 * 60 * 1000 : 0;

/**
 * Terminal Bench figures for one model, as produced by `summarizeTerminalBench`.
 */
export type TerminalBenchSummary = {
// Copied from the `terminal-bench` eval's `overallScore`.
overallScore: number;
// Copied from the eval's `avgAttemptCostUsd`; rows where it is null or undefined are not summarized.
avgAttemptCostUsd: number;
};

/** Read-only lookup of summaries; `summarizeTerminalBench` keys it by each row's `openrouterId`. */
export type TerminalBenchSummaries = ReadonlyMap<string, TerminalBenchSummary>;

/**
 * Input row for `summarizeTerminalBench`; the same four columns that
 * `loadTerminalBench` selects from `modelStats`.
 */
type Row = {
openrouterId: string;
// A null value is treated like `false`: the row is skipped.
isActive: boolean | null;
isStealth: boolean;
// Left untyped here; validated with `ModelStatsBenchmarksSchema.safeParse` before use.
benchmarks: unknown;
};

/**
 * Reduces model-stats rows to the Terminal Bench summaries that may be published.
 *
 * A row contributes a summary only when all of the following hold:
 * - it is active and not stealth;
 * - its `benchmarks` value parses with `ModelStatsBenchmarksSchema`;
 * - the parsed data has a `terminal-bench` eval under `kiloBench.evals`;
 * - that eval reports at least 5 attempts (a missing count is treated as 0);
 * - that eval has a non-null, defined `avgAttemptCostUsd`.
 *
 * @param rows Rows to inspect, processed in order; a later row with the same
 *   `openrouterId` replaces an earlier one.
 * @returns Map from `openrouterId` to its summary.
 */
export function summarizeTerminalBench(rows: readonly Row[]): TerminalBenchSummaries {
  const byModelId = new Map<string, TerminalBenchSummary>();

  rows.forEach(candidate => {
    const visible = Boolean(candidate.isActive) && !candidate.isStealth;
    if (!visible) return;

    const parsed = ModelStatsBenchmarksSchema.safeParse(candidate.benchmarks);
    if (!parsed.success) return;

    const terminalBench = parsed.data?.kiloBench?.evals['terminal-bench'];
    if (!terminalBench) return;

    const attempts = terminalBench.nAttempts ?? 0;
    if (attempts < 5) return;

    const { avgAttemptCostUsd } = terminalBench;
    if (avgAttemptCostUsd === null || avgAttemptCostUsd === undefined) return;

    byModelId.set(candidate.openrouterId, {
      overallScore: terminalBench.overallScore,
      avgAttemptCostUsd,
    });
  });

  return byModelId;
}

/**
 * Looks up the Terminal Bench summary for a model ID.
 *
 * The ID is tried as given first. If nothing is stored under it, the ID is passed
 * through `unprefixKiloGatewayModelId` and, when that yields a truthy value, the
 * lookup is retried with the result.
 *
 * @param summaries Lookup keyed by model ID.
 * @param id Model ID to resolve.
 * @returns The matching summary, or `undefined` when neither lookup finds one.
 */
export function terminalBenchFor(
  summaries: TerminalBenchSummaries,
  id: string
): TerminalBenchSummary | undefined {
  const direct = summaries.get(id);
  if (direct) {
    return direct;
  }

  const canonicalId = unprefixKiloGatewayModelId(id);
  if (!canonicalId) {
    return undefined;
  }
  return summaries.get(canonicalId);
}

/**
 * Reads the active, non-stealth `modelStats` rows through `readDb` and turns them
 * into Terminal Bench summaries with `summarizeTerminalBench`.
 */
async function loadTerminalBench(): Promise<TerminalBenchSummaries> {
  const publishable = and(eq(modelStats.isActive, true), eq(modelStats.isStealth, false));
  const columns = {
    openrouterId: modelStats.openrouterId,
    isActive: modelStats.isActive,
    isStealth: modelStats.isStealth,
    benchmarks: modelStats.benchmarks,
  };

  const rows = await readDb.select(columns).from(modelStats).where(publishable);
  return summarizeTerminalBench(rows);
}

/**
 * Wraps a summaries loader with `createCachedFetch`, passing `TTL` and an empty
 * map as the second and third arguments.
 * NOTE(review): the empty map is presumably the fallback/initial value — confirm
 * against `createCachedFetch`.
 *
 * @param load Loader to wrap; defaults to `loadTerminalBench`.
 * @returns Whatever `createCachedFetch` returns for the wrapped loader.
 */
function createTerminalBenchFetch(load = loadTerminalBench) {
  // Log the failure under the module tag, then rethrow so the rejection still propagates.
  const logAndRethrow = (err: unknown): never => {
    console.error('[terminal-bench] Failed to load model summaries:', err);
    throw err;
  };
  const loadWithLogging = () => load().catch(logAndRethrow);
  const emptySummaries = new Map<string, TerminalBenchSummary>();

  return createCachedFetch(loadWithLogging, TTL, emptySummaries);
}

/**
 * Shared Terminal Bench summaries fetcher, created once at module load from
 * `createTerminalBenchFetch()` with its default loader.
 * NOTE(review): caching and failure behavior come from `createCachedFetch` — confirm there.
 */
export const getTerminalBenchSummaries = createTerminalBenchFetch();
6 changes: 6 additions & 0 deletions apps/web/src/lib/organizations/organization-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,12 @@ const OpenRouterModelSchema = z.object({
// kilocode additions:
preferredIndex: z.number().optional(),
isFree: z.boolean().optional(),
terminalBench: z
.object({
overallScore: z.number(),
avgAttemptCostUsd: z.number(),
})
.optional(),
opencode: OpenCodeSettingsSchema.optional(),

id: z.string(),
Expand Down
38 changes: 33 additions & 5 deletions apps/web/src/tests/openrouter-models.test.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
import { test, expect, describe, afterEach, jest, beforeEach } from '@jest/globals';
import { test, expect, describe, afterEach, beforeEach } from '@jest/globals';
import { mockOpenRouterModels, createMockResponse } from './helpers/openrouter-models.helper';
import { GET } from '../app/api/openrouter/models/route';
import { NextRequest } from 'next/server';

jest.mock('@/lib/user/server', () => ({
getUserByAuthorizationHeader: jest.fn().mockImplementation(async () => ({
getUserFromAuth: jest.fn(async () => ({
user: { id: 'test-user-id' },
authFailedResponse: null,
organizationId: null,
})),
}));

// Replace the terminal-bench module: the summaries fetcher resolves to a single
// entry for 'some-other-model', and lookups are plain exact-ID `Map.get` calls.
jest.mock('@/lib/model-stats/terminal-bench', () => {
  const getTerminalBenchSummaries = jest.fn(async () => {
    return new Map([['some-other-model', { overallScore: 0.551, avgAttemptCostUsd: 53.37 }]]);
  });
  const terminalBenchFor = jest.fn((summaries: Map<string, unknown>, id: string) => {
    return summaries.get(id);
  });

  return { getTerminalBenchSummaries, terminalBenchFor };
});

function createTestRequest(path: string) {
return new NextRequest(new URL(path, 'http://localhost:3000'), {
method: 'GET',
Expand All @@ -18,8 +25,7 @@ function createTestRequest(path: string) {

describe('GET /api/openrouter/models', () => {
beforeEach(() => {
// Reset all mocks before each test
jest.resetAllMocks();
jest.clearAllMocks();
});

test('should handle OpenRouter API errors', async () => {
Expand Down Expand Up @@ -86,6 +92,28 @@ describe('GET /api/openrouter/models', () => {
expect(responseData.data).toBeDefined();
expect(Array.isArray(responseData.data)).toBe(true);
});

test('should include publishable Terminal Bench summaries for canonical models', async () => {
  const request = createTestRequest('/api/openrouter/models');

  // Serve the shared fixture as the upstream OpenRouter response.
  global.fetch = jest.fn(() => {
    return Promise.resolve(
      createMockResponse({
        ok: true,
        status: 200,
        statusText: 'OK',
        jsonData: mockOpenRouterModels,
      })
    );
  }) as unknown as typeof fetch;

  const response = await GET(request);
  // Check the status before touching the payload so an error response fails
  // here instead of as a TypeError on `data.find`.
  expect(response.status).toBe(200);

  const responseData = await response.json();
  const model = responseData.data.find((item: { id: string }) => item.id === 'some-other-model');

  // Fail with a clear message if the fixture no longer contains the model,
  // rather than throwing when reading `terminalBench` off `undefined`.
  expect(model).toBeDefined();
  expect(model.terminalBench).toEqual({ overallScore: 0.551, avgAttemptCostUsd: 53.37 });
});
});

afterEach(() => {
Expand Down