From 4a45ccddad87ba98c53524e0cf927e083a75331a Mon Sep 17 00:00:00 2001
From: Josh Lambert <josh@kilocode.ai>
Date: Wed, 3 Jun 2026 10:32:27 -0400
Subject: [PATCH 1/2] feat(ai-gateway): enrich catalog with Terminal Bench
 stats

---
 .../ai-gateway/providers/openrouter/index.ts  |   9 +-
 .../lib/model-stats/terminal-bench.test.ts    | 110 ++++++++++++++++++
 .../web/src/lib/model-stats/terminal-bench.ts |  83 +++++++++++++
 .../lib/organizations/organization-types.ts   |   6 +
 apps/web/src/tests/openrouter-models.test.ts  |  38 +++++-
 5 files changed, 240 insertions(+), 6 deletions(-)
 create mode 100644 apps/web/src/lib/model-stats/terminal-bench.test.ts
 create mode 100644 apps/web/src/lib/model-stats/terminal-bench.ts
diff --git a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
index e48c4a1437..8facb59313 100644
--- a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
+++ b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
@@ -20,6 +20,7 @@ import { ATTRIBUTION_HEADERS } from '@/lib/ai-gateway/providers/openrouter/attri
 import { getOpenRouterModelsMetadata } from '@/lib/ai-gateway/providers/gateway-models-cache';
 import { getPreferredProviderOrder } from '@/lib/ai-gateway/providers/apply-provider-specific-logic';
 import { normalizeInferenceProviderId } from '@/lib/ai-gateway/providers/openrouter/inference-provider-id';
+import { getTerminalBenchSummaries, terminalBenchFor } from '@/lib/model-stats/terminal-bench';
 
 // Re-export from shared module for backwards compatibility
 export { normalizeModelId } from '@/lib/ai-gateway/model-utils';
@@ -93,6 +94,7 @@ export function formatName(model: OpenRouterModel, preferredIndex: number) {
 async function enhancedModelList(models: OpenRouterModel[]) {
   const autoModels = buildAutoModels();
   const endpointsMetadata = await getOpenRouterModelsMetadata();
+  const summaries = await getTerminalBenchSummaries();
   const enhancedModels = await Promise.all(
     models
       .filter(
@@ -111,7 +113,12 @@ async function enhancedModelList(models: OpenRouterModel[]) {
                 normalizeInferenceProviderId(preferredProvider)
             )?.pricing)
           : undefined;
-        return pricing ? { ...model, pricing } : model;
+        const terminalBench = terminalBenchFor(summaries, model.id);
+        return {
+          ...model,
+          ...(pricing && { pricing }),
+          ...(terminalBench && { terminalBench }),
+        };
       })
       .concat(
         kiloExclusiveModels
diff --git a/apps/web/src/lib/model-stats/terminal-bench.test.ts b/apps/web/src/lib/model-stats/terminal-bench.test.ts
new file mode 100644
index 0000000000..43a91a36f8
--- /dev/null
+++ b/apps/web/src/lib/model-stats/terminal-bench.test.ts
@@ -0,0 +1,110 @@
+import { describe, expect, it, jest } from '@jest/globals';
+import {
+  createTerminalBenchFetch,
+  summarizeTerminalBench,
+  terminalBenchFor,
+  type TerminalBenchSummaries,
+} from './terminal-bench';
+
+const summary = { overallScore: 0.551, avgAttemptCostUsd: 53.37 };
+
+function benchmarks(
+  overrides: Partial<{ nAttempts: number | null; avgAttemptCostUsd: number | null }> = {}
+) {
+  return {
+    kiloBench: {
+      overallScore: 0.4,
+      evals: {
+        'terminal-bench': {
+          taskSource: 'terminal-bench',
+          overallScore: summary.overallScore,
+          totalScore: 2.755,
+          avgCostUsd: 1,
+          avgInputTokens: 1,
+          avgOutputTokens: 1,
+          avgCacheReadTokens: 1,
+          avgExecutionMs: 1,
+          nTotalTrials: 5,
+          nAttempts: 5,
+          avgAttemptCostUsd: summary.avgAttemptCostUsd,
+          avgAttemptInputTokens: 1,
+          avgAttemptOutputTokens: 1,
+          avgAttemptCacheReadTokens: 1,
+          nErrored: 0,
+          lastPromotedAt: '2026-06-03T00:00:00.000Z',
+          ...overrides,
+        },
+      },
+    },
+  };
+}
+
+function row(overrides: Partial<Parameters<typeof summarizeTerminalBench>[0][number]> = {}) {
+  return {
+    openrouterId: 'openai/model',
+    isActive: true,
+    isStealth: false,
+    benchmarks: benchmarks(),
+    ...overrides,
+  };
+}
+
+describe('summarizeTerminalBench', () => {
+  it('extracts publishable summaries keyed by OpenRouter ID', () => {
+    expect(summarizeTerminalBench([row()])).toEqual(new Map([['openai/model', summary]]));
+  });
+
+  it.each([
+    ['inactive', row({ isActive: false })],
+    ['stealth', row({ isStealth: true })],
+    ['fewer than five attempts', row({ benchmarks: benchmarks({ nAttempts: 4 }) })],
+    ['null attempt cost', row({ benchmarks: benchmarks({ avgAttemptCostUsd: null }) })],
+    ['missing eval', row({ benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } })],
+    ['invalid benchmarks', row({ benchmarks: { kiloBench: { overallScore: 'invalid' } } })],
+  ])('omits %s rows', (_name, input) => {
+    expect(summarizeTerminalBench([input])).toEqual(new Map());
+  });
+});
+
+describe('terminalBenchFor', () => {
+  const summaries = new Map([['openai/model', summary]]);
+
+  it('matches exact OpenRouter IDs', () => {
+    expect(terminalBenchFor(summaries, 'openai/model')).toEqual(summary);
+  });
+
+  it('matches safely prefixed Kilo gateway IDs', () => {
+    expect(terminalBenchFor(summaries, 'kilo/openai/model')).toEqual(summary);
+  });
+
+  it('does not strip ambiguous Kilo-owned IDs', () => {
+    expect(terminalBenchFor(summaries, 'kilo/special-model')).toBeUndefined();
+  });
+});
+
+describe('createTerminalBenchFetch', () => {
+  it('falls back to an empty map when the first lookup fails', async () => {
+    jest.spyOn(console, 'error').mockImplementation(() => undefined);
+    const load = jest
+      .fn<() => Promise<TerminalBenchSummaries>>()
+      .mockRejectedValue(new Error('lookup failed'));
+    const get = createTerminalBenchFetch(load);
+
+    expect(await get()).toEqual(new Map());
+    expect(console.error).toHaveBeenCalled();
+  });
+
+  it('falls back to the last-known-good map when refresh fails', async () => {
+    jest.spyOn(console, 'error').mockImplementation(() => undefined);
+    const good = new Map([['openai/model', summary]]);
+    const load = jest
+      .fn<() => Promise<TerminalBenchSummaries>>()
+      .mockResolvedValueOnce(good)
+      .mockRejectedValueOnce(new Error('lookup failed'));
+    const get = createTerminalBenchFetch(load);
+
+    expect(await get()).toBe(good);
+    expect(await get()).toBe(good);
+    expect(console.error).toHaveBeenCalled();
+  });
+});
diff --git a/apps/web/src/lib/model-stats/terminal-bench.ts b/apps/web/src/lib/model-stats/terminal-bench.ts
new file mode 100644
index 0000000000..ed972d9b75
--- /dev/null
+++ b/apps/web/src/lib/model-stats/terminal-bench.ts
@@ -0,0 +1,83 @@
+import { createCachedFetch } from '@/lib/cached-fetch';
+import { readDb } from '@/lib/drizzle';
+import { ModelStatsBenchmarksSchema, modelStats } from '@kilocode/db/schema';
+import { unprefixKiloGatewayModelId } from '@kilocode/worker-utils/kilo-model-id';
+import { and, eq } from 'drizzle-orm';
+
+const TTL = process.env.NODE_ENV === 'test' ? 0 : 5 * 60 * 1000;
+
+export type TerminalBenchSummary = {
+  overallScore: number;
+  avgAttemptCostUsd: number;
+};
+
+export type TerminalBenchSummaries = ReadonlyMap<string, TerminalBenchSummary>;
+
+type Row = {
+  openrouterId: string;
+  isActive: boolean | null;
+  isStealth: boolean;
+  benchmarks: unknown;
+};
+
+export function summarizeTerminalBench(rows: readonly Row[]): TerminalBenchSummaries {
+  const summaries = new Map<string, TerminalBenchSummary>();
+  // Keep active, non-stealth rows whose 'terminal-bench' eval has >= 5 attempts and a cost.
+  for (const row of rows) {
+    if (!row.isActive || row.isStealth) continue; // a null isActive also skips the row
+    const result = ModelStatsBenchmarksSchema.safeParse(row.benchmarks);
+    if (!result.success) continue; // benchmarks that fail schema validation are skipped
+    const bench = result.data?.kiloBench?.evals['terminal-bench'];
+    if (
+      !bench ||
+      (bench.nAttempts ?? 0) < 5 || // a missing attempt count is treated as 0
+      bench.avgAttemptCostUsd === null ||
+      bench.avgAttemptCostUsd === undefined
+    ) {
+      continue; // no eval, too few attempts, or no attempt cost: nothing to publish
+    }
+    summaries.set(row.openrouterId, {
+      overallScore: bench.overallScore,
+      avgAttemptCostUsd: bench.avgAttemptCostUsd,
+    });
+  }
+
+  return summaries;
+}
+
+export function terminalBenchFor(
+  summaries: TerminalBenchSummaries,
+  id: string
+): TerminalBenchSummary | undefined {
+  const exact = summaries.get(id); // an exact ID match takes precedence
+  if (exact) return exact;
+  const unprefixed = unprefixKiloGatewayModelId(id); // a falsy result means no second lookup
+  return unprefixed ? summaries.get(unprefixed) : undefined;
+}
+
+async function loadTerminalBench(): Promise<TerminalBenchSummaries> {
+  const rows = await readDb
+    .select({
+      openrouterId: modelStats.openrouterId,
+      isActive: modelStats.isActive,
+      isStealth: modelStats.isStealth,
+      benchmarks: modelStats.benchmarks,
+    })
+    .from(modelStats) // the where clause below pre-filters to active, non-stealth rows
+    .where(and(eq(modelStats.isActive, true), eq(modelStats.isStealth, false)));
+  return summarizeTerminalBench(rows);
+}
+
+export function createTerminalBenchFetch(load = loadTerminalBench) {
+  return createCachedFetch(
+    () =>
+      load().catch(err => {
+        console.error('[terminal-bench] Failed to load model summaries:', err);
+        throw err; // rethrown after logging so the failure is not swallowed here
+      }),
+    TTL, // 0 when NODE_ENV is 'test', else 5 * 60 * 1000
+    new Map<string, TerminalBenchSummary>() // presumably the fallback value — TODO confirm
+  );
+}
+
+export const getTerminalBenchSummaries = createTerminalBenchFetch();
diff --git a/apps/web/src/lib/organizations/organization-types.ts b/apps/web/src/lib/organizations/organization-types.ts
index 0dd02aa5bc..84d52413cc 100644
--- a/apps/web/src/lib/organizations/organization-types.ts
+++ b/apps/web/src/lib/organizations/organization-types.ts
@@ -199,6 +199,12 @@ const OpenRouterModelSchema = z.object({
   // kilocode additions:
   preferredIndex: z.number().optional(),
   isFree: z.boolean().optional(),
+  terminalBench: z
+    .object({
+      overallScore: z.number(),
+      avgAttemptCostUsd: z.number(),
+    })
+    .optional(),
   opencode: OpenCodeSettingsSchema.optional(),
 
   id: z.string(),
diff --git a/apps/web/src/tests/openrouter-models.test.ts b/apps/web/src/tests/openrouter-models.test.ts
index 044e8d5857..f77333fcd8 100644
--- a/apps/web/src/tests/openrouter-models.test.ts
+++ b/apps/web/src/tests/openrouter-models.test.ts
@@ -1,15 +1,22 @@
-import { test, expect, describe, afterEach, jest, beforeEach } from '@jest/globals';
+import { test, expect, describe, afterEach, beforeEach } from '@jest/globals';
 import { mockOpenRouterModels, createMockResponse } from './helpers/openrouter-models.helper';
 import { GET } from '../app/api/openrouter/models/route';
 import { NextRequest } from 'next/server';
 
 jest.mock('@/lib/user/server', () => ({
-  getUserByAuthorizationHeader: jest.fn().mockImplementation(async () => ({
+  getUserFromAuth: jest.fn(async () => ({
     user: { id: 'test-user-id' },
-    authFailedResponse: null,
+    organizationId: null,
   })),
 }));
 
+jest.mock('@/lib/model-stats/terminal-bench', () => ({
+  getTerminalBenchSummaries: jest.fn(
+    async () => new Map([['some-other-model', { overallScore: 0.551, avgAttemptCostUsd: 53.37 }]])
+  ),
+  terminalBenchFor: jest.fn((summaries: Map<string, unknown>, id: string) => summaries.get(id)),
+}));
+
 function createTestRequest(path: string) {
   return new NextRequest(new URL(path, 'http://localhost:3000'), {
     method: 'GET',
@@ -18,8 +25,7 @@ function createTestRequest(path: string) {
 
 describe('GET /api/openrouter/models', () => {
   beforeEach(() => {
-    // Reset all mocks before each test
-    jest.resetAllMocks();
+    jest.clearAllMocks();
   });
 
   test('should handle OpenRouter API errors', async () => {
@@ -86,6 +92,28 @@ describe('GET /api/openrouter/models', () => {
     expect(responseData.data).toBeDefined();
     expect(Array.isArray(responseData.data)).toBe(true);
   });
+
+  test('should include publishable Terminal Bench summaries for canonical models', async () => {
+    const request = createTestRequest('/api/openrouter/models');
+
+    global.fetch = jest.fn(() => {
+      return Promise.resolve(
+        createMockResponse({
+          ok: true,
+          status: 200,
+          statusText: 'OK',
+          jsonData: mockOpenRouterModels,
+        })
+      );
+    }) as unknown as typeof fetch;
+
+    const response = await GET(request);
+    const responseData = await response.json();
+    const model = responseData.data.find((item: { id: string }) => item.id === 'some-other-model');
+
+    expect(response.status).toBe(200);
+    expect(model.terminalBench).toEqual({ overallScore: 0.551, avgAttemptCostUsd: 53.37 });
+  });
 });
 
 afterEach(() => {

From 22589e817d0bc94d4f53f1e8e40192d014b31ca6 Mon Sep 17 00:00:00 2001
From: Josh Lambert <josh@kilocode.ai>
Date: Fri, 5 Jun 2026 16:41:49 -0400
Subject: [PATCH 2/2] test(ai-gateway): focus Terminal Bench coverage

---
 .../lib/model-stats/terminal-bench.test.ts    | 70 ++++---------------
 .../web/src/lib/model-stats/terminal-bench.ts |  2 +-
 2 files changed, 16 insertions(+), 56 deletions(-)

diff --git a/apps/web/src/lib/model-stats/terminal-bench.test.ts b/apps/web/src/lib/model-stats/terminal-bench.test.ts
index 43a91a36f8..26d211e04a 100644
--- a/apps/web/src/lib/model-stats/terminal-bench.test.ts
+++ b/apps/web/src/lib/model-stats/terminal-bench.test.ts
@@ -1,10 +1,5 @@
-import { describe, expect, it, jest } from '@jest/globals';
-import {
-  createTerminalBenchFetch,
-  summarizeTerminalBench,
-  terminalBenchFor,
-  type TerminalBenchSummaries,
-} from './terminal-bench';
+import { describe, expect, it } from '@jest/globals';
+import { summarizeTerminalBench, terminalBenchFor } from './terminal-bench';
 
 const summary = { overallScore: 0.551, avgAttemptCostUsd: 53.37 };
 
@@ -50,61 +45,26 @@ function row(overrides: Partial<Parameters<typeof summarizeTerminalBench>[0][num
 }
 
 describe('summarizeTerminalBench', () => {
-  it('extracts publishable summaries keyed by OpenRouter ID', () => {
-    expect(summarizeTerminalBench([row()])).toEqual(new Map([['openai/model', summary]]));
-  });
+  it('publishes only eligible summaries', () => {
+    const summaries = summarizeTerminalBench([
+      row(), // eligible; each row below gets its own ID so a leaked row would add a key
+      row({ openrouterId: 'inactive', isActive: false }),
+      row({ openrouterId: 'stealth', isStealth: true }),
+      row({ openrouterId: 'attempts', benchmarks: benchmarks({ nAttempts: 4 }) }),
+      row({ openrouterId: 'cost', benchmarks: benchmarks({ avgAttemptCostUsd: null }) }),
+      row({ openrouterId: 'eval', benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } }),
+      row({ openrouterId: 'invalid', benchmarks: { kiloBench: { overallScore: 'invalid' } } }),
+    ]);
 
-  it.each([
-    ['inactive', row({ isActive: false })],
-    ['stealth', row({ isStealth: true })],
-    ['fewer than five attempts', row({ benchmarks: benchmarks({ nAttempts: 4 }) })],
-    ['null attempt cost', row({ benchmarks: benchmarks({ avgAttemptCostUsd: null }) })],
-    ['missing eval', row({ benchmarks: { kiloBench: { overallScore: 0.4, evals: {} } } })],
-    ['invalid benchmarks', row({ benchmarks: { kiloBench: { overallScore: 'invalid' } } })],
-  ])('omits %s rows', (_name, input) => {
-    expect(summarizeTerminalBench([input])).toEqual(new Map());
+    expect(summaries).toEqual(new Map([['openai/model', summary]]));
   });
 });
 
 describe('terminalBenchFor', () => {
-  const summaries = new Map([['openai/model', summary]]);
-
-  it('matches exact OpenRouter IDs', () => {
-    expect(terminalBenchFor(summaries, 'openai/model')).toEqual(summary);
-  });
+  it('matches only safe canonical IDs', () => {
+    const summaries = new Map([['openai/model', summary]]);
 
-  it('matches safely prefixed Kilo gateway IDs', () => {
     expect(terminalBenchFor(summaries, 'kilo/openai/model')).toEqual(summary);
-  });
-
-  it('does not strip ambiguous Kilo-owned IDs', () => {
     expect(terminalBenchFor(summaries, 'kilo/special-model')).toBeUndefined();
   });
 });
-
-describe('createTerminalBenchFetch', () => {
-  it('falls back to an empty map when the first lookup fails', async () => {
-    jest.spyOn(console, 'error').mockImplementation(() => undefined);
-    const load = jest
-      .fn<() => Promise<TerminalBenchSummaries>>()
-      .mockRejectedValue(new Error('lookup failed'));
-    const get = createTerminalBenchFetch(load);
-
-    expect(await get()).toEqual(new Map());
-    expect(console.error).toHaveBeenCalled();
-  });
-
-  it('falls back to the last-known-good map when refresh fails', async () => {
-    jest.spyOn(console, 'error').mockImplementation(() => undefined);
-    const good = new Map([['openai/model', summary]]);
-    const load = jest
-      .fn<() => Promise<TerminalBenchSummaries>>()
-      .mockResolvedValueOnce(good)
-      .mockRejectedValueOnce(new Error('lookup failed'));
-    const get = createTerminalBenchFetch(load);
-
-    expect(await get()).toBe(good);
-    expect(await get()).toBe(good);
-    expect(console.error).toHaveBeenCalled();
-  });
-});
diff --git a/apps/web/src/lib/model-stats/terminal-bench.ts b/apps/web/src/lib/model-stats/terminal-bench.ts
index ed972d9b75..eb692e6452 100644
--- a/apps/web/src/lib/model-stats/terminal-bench.ts
+++ b/apps/web/src/lib/model-stats/terminal-bench.ts
@@ -68,7 +68,7 @@ async function loadTerminalBench(): Promise<TerminalBenchSummaries> {
   return summarizeTerminalBench(rows);
 }
 
-export function createTerminalBenchFetch(load = loadTerminalBench) {
+function createTerminalBenchFetch(load = loadTerminalBench) {
   return createCachedFetch(
     () =>
       load().catch(err => {