diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 3089a5c6..b60b1c2d 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -95,6 +95,7 @@ export interface IndexArtifactEntry { readonly timestamp: string; readonly test_id: string; readonly dataset?: string; + readonly category?: string; readonly conversation_id?: string; readonly score: number; readonly target: string; @@ -508,6 +509,7 @@ export function buildIndexArtifactEntry( timestamp: result.timestamp, test_id: result.testId ?? 'unknown', dataset: getDataset(result), + category: result.category, conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', @@ -539,6 +541,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA timestamp: result.timestamp, test_id: result.testId ?? 'unknown', dataset: getDataset(result), + category: result.category, conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts index 7ac77231..f8ea59e1 100644 --- a/apps/cli/src/commands/eval/discover.ts +++ b/apps/cli/src/commands/eval/discover.ts @@ -1,5 +1,5 @@ import path from 'node:path'; -import { DEFAULT_EVAL_PATTERNS, loadConfig } from '@agentv/core'; +import { DEFAULT_EVAL_PATTERNS, deriveCategory, loadConfig } from '@agentv/core'; import fg from 'fast-glob'; import { findRepoRoot } from './shared.js'; @@ -52,21 +52,6 @@ export async function discoverEvalFiles(cwd: string): Promise d !== 'evals'); - return dirs.length > 0 ? dirs.join('/') : 'root'; -} - /** Get unique categories from discovered eval files. 
*/ export function getCategories(files: readonly DiscoveredEvalFile[]): readonly string[] { const categories = new Set(); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a0eaad40..32d3318f 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -14,6 +14,7 @@ import { ResponseCache, type TrialsConfig, runEvaluation as defaultRunEvaluation, + deriveCategory, ensureVSCodeSubagents, loadConfig, loadTestSuite, @@ -444,9 +445,13 @@ async function prepareFileMetadata(params: { verbose: options.verbose, }); + const relativePath = path.relative(cwd, testFilePath); + const category = deriveCategory(relativePath); + const suite = await loadTestSuite(testFilePath, repoRoot, { verbose: options.verbose, filter: options.filter, + category, }); const filteredIds = suite.tests.map((value) => value.id); diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 26da5826..ef53fe5e 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -20,10 +20,10 @@ */ import { readFile } from 'node:fs/promises'; import { mkdir, writeFile } from 'node:fs/promises'; -import { dirname, join, resolve } from 'node:path'; +import { dirname, join, relative, resolve } from 'node:path'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; -import { loadTestSuite } from '@agentv/core'; +import { deriveCategory, loadTestSuite } from '@agentv/core'; import { command, option, optional, positional, string } from 'cmd-ts'; import { buildDefaultRunDir } from '../eval/result-layout.js'; @@ -57,7 +57,8 @@ export const evalInputCommand = command({ const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); const evalDir = dirname(resolvedEvalPath); - const suite = await loadTestSuite(resolvedEvalPath, repoRoot); + const category = deriveCategory(relative(process.cwd(), 
resolvedEvalPath)); + const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); const tests = suite.tests; if (tests.length === 0) { diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 09033635..d2c18811 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -15,9 +15,9 @@ import { execSync } from 'node:child_process'; import { existsSync, readFileSync, unlinkSync } from 'node:fs'; import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; -import { dirname, join, resolve } from 'node:path'; +import { dirname, join, relative, resolve } from 'node:path'; -import { executeScript, loadTestSuite } from '@agentv/core'; +import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; import { command, number, option, optional, positional, string } from 'cmd-ts'; @@ -91,7 +91,8 @@ export const evalRunCommand = command({ const evalDir = dirname(resolvedEvalPath); // ── Step 1: Extract inputs (same as pipeline input) ────────────── - const suite = await loadTestSuite(resolvedEvalPath, repoRoot); + const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); + const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); const tests = suite.tests; if (tests.length === 0) { diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index fe642d36..7a4e3d72 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -12,6 +12,7 @@ export interface ResultManifestRecord { readonly test_id?: string; readonly eval_id?: string; readonly dataset?: string; + readonly category?: string; readonly experiment?: string; readonly target?: string; readonly score: number; @@ -125,6 +126,7 @@ function 
/** The subset of result fields the category/dataset summaries read. */
interface SummarizableResult {
  readonly category?: string;
  readonly dataset?: string;
  readonly target?: string;
  readonly score: number;
}

/** Running totals for one group (a category or a dataset) of results. */
interface ScoreTally {
  total: number;
  passed: number;
  scoreSum: number;
  /** Distinct dataset names seen in the group. */
  datasets: Set<string>;
}

/** Group key helpers; the fallbacks match what the run's other summaries display. */
const categoryNameOf = (r: SummarizableResult): string => r.category ?? DEFAULT_CATEGORY;
const datasetNameOf = (r: SummarizableResult): string => r.dataset ?? r.target ?? 'default';

/** Accumulate totals, pass counts (score >= 1) and score sums per group key. */
function tallyBy(
  results: readonly SummarizableResult[],
  keyOf: (r: SummarizableResult) => string,
): Map<string, ScoreTally> {
  const tallies = new Map<string, ScoreTally>();
  for (const r of results) {
    const key = keyOf(r);
    let tally = tallies.get(key);
    if (!tally) {
      tally = { total: 0, passed: 0, scoreSum: 0, datasets: new Set<string>() };
      tallies.set(key, tally);
    }
    tally.total++;
    if (r.score >= 1) tally.passed++;
    tally.scoreSum += r.score;
    tally.datasets.add(datasetNameOf(r));
  }
  return tallies;
}

/** Shape one tally into the snake_case summary object returned by the API. */
function toSummary(name: string, tally: ScoreTally) {
  return {
    name,
    total: tally.total,
    passed: tally.passed,
    failed: tally.total - tally.passed,
    avg_score: tally.total > 0 ? tally.scoreSum / tally.total : 0,
  };
}

/**
 * Decode a route parameter without ever throwing.
 *
 * `decodeURIComponent` raises `URIError` on a malformed escape (e.g. a literal
 * `%` in a directory name); previously that happened outside the handler's
 * try/catch. A value that cannot be decoded is used as-is.
 *
 * NOTE(review): current Hono releases appear to return already-decoded values
 * from `c.req.param()` — if that holds for the pinned version, this decode can
 * be dropped entirely.
 */
function decodeRouteParam(raw: string): string {
  if (!raw.includes('%')) return raw;
  try {
    return decodeURIComponent(raw);
  } catch {
    return raw;
  }
}

// Category summaries for a run
app.get('/api/runs/:filename/categories', (c) => {
  const filename = c.req.param('filename');
  const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
  if (!meta) {
    return c.json({ error: 'Run not found' }, 404);
  }
  try {
    const loaded = patchTestIds(loadManifestResults(meta.path));
    const categories = [...tallyBy(loaded, categoryNameOf)].map(([name, tally]) => ({
      ...toSummary(name, tally),
      dataset_count: tally.datasets.size,
    }));
    return c.json({ categories });
  } catch {
    return c.json({ error: 'Failed to load categories' }, 500);
  }
});

// Datasets within a category for a run
app.get('/api/runs/:filename/categories/:category/datasets', (c) => {
  const filename = c.req.param('filename');
  const category = decodeRouteParam(c.req.param('category'));
  const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
  if (!meta) {
    return c.json({ error: 'Run not found' }, 404);
  }
  try {
    const loaded = patchTestIds(loadManifestResults(meta.path));
    const inCategory = loaded.filter((r) => categoryNameOf(r) === category);
    const datasets = [...tallyBy(inCategory, datasetNameOf)].map(([name, tally]) =>
      toSummary(name, tally),
    );
    return c.json({ datasets });
  } catch {
    return c.json({ error: 'Failed to load datasets' }, 500);
  }
});
/** Pass/fail counts and mean score for one dataset inside a category. */
interface DatasetStats {
  name: string;
  passed: number;
  failed: number;
  total: number;
  avgScore: number;
}

/** A category with its datasets (sorted by name) and rolled-up totals. */
interface CategoryGroup {
  name: string;
  datasets: DatasetStats[];
  total: number;
  passed: number;
  failed: number;
  avgScore: number;
}

/** The only result fields the grouping reads; a full `EvalResult` satisfies this. */
interface GroupableResult {
  readonly category?: string;
  readonly dataset?: string;
  readonly score: number;
}

/** Internal running totals for one dataset while scanning results. */
interface DatasetTally {
  passed: number;
  failed: number;
  total: number;
  scoreSum: number;
}

/**
 * Group results by category, then by dataset within each category.
 *
 * A result counts as passed when `score >= 1`. Results without a category or
 * dataset fall into a group named 'Uncategorized'. Categories and the datasets
 * inside them are both sorted by name with `localeCompare`.
 *
 * @param results - results of a single run; not mutated
 * @returns one entry per category; an empty array for empty input
 */
function buildCategoryGroups(results: readonly GroupableResult[]): CategoryGroup[] {
  const tallies = new Map<string, Map<string, DatasetTally>>();

  for (const r of results) {
    const cat = r.category ?? 'Uncategorized';
    const ds = r.dataset ?? 'Uncategorized';

    // Get-or-create at both levels so no non-null assertion is needed.
    let byDataset = tallies.get(cat);
    if (!byDataset) {
      byDataset = new Map<string, DatasetTally>();
      tallies.set(cat, byDataset);
    }
    let tally = byDataset.get(ds);
    if (!tally) {
      tally = { passed: 0, failed: 0, total: 0, scoreSum: 0 };
      byDataset.set(ds, tally);
    }

    tally.total += 1;
    tally.scoreSum += r.score;
    if (r.score >= 1) tally.passed += 1;
    else tally.failed += 1;
  }

  const byName = (a: { name: string }, b: { name: string }): number =>
    a.name.localeCompare(b.name);

  return Array.from(tallies, ([name, byDataset]): CategoryGroup => {
    const datasets: DatasetStats[] = [];
    let total = 0;
    let passed = 0;
    let failed = 0;
    let scoreSum = 0;

    for (const [dsName, t] of byDataset) {
      // Emit only the public DatasetStats fields; the raw score sum stays internal.
      datasets.push({
        name: dsName,
        passed: t.passed,
        failed: t.failed,
        total: t.total,
        avgScore: t.total > 0 ? t.scoreSum / t.total : 0,
      });
      total += t.total;
      passed += t.passed;
      failed += t.failed;
      // Roll up the exact sum instead of reconstructing it from avgScore * total.
      scoreSum += t.scoreSum;
    }
    datasets.sort(byName);

    return {
      name,
      datasets,
      total,
      passed,
      failed,
      avgScore: total > 0 ? scoreSum / total : 0,
    };
  }).sort(byName);
}
+

Categories

+ {categories.map((cat) => ( + + ))} +
+ ) : (

Datasets

- {datasets.map((cat) => ( - -
- {cat.name} - - {cat.passed}/{cat.total} - -
-
- -
-
- {cat.passed} passed - {cat.failed > 0 && {cat.failed} failed} -
- + {categories[0]?.datasets.map((ds) => ( + ))}
@@ -148,6 +181,68 @@ export function RunDetail({ results, runId }: RunDetailProps) { ); } +function CategorySection({ category, runId }: { category: CategoryGroup; runId: string }) { + const [expanded, setExpanded] = useState(true); + + return ( +
+ + {expanded && ( +
+
+ {category.datasets.map((ds) => ( + + ))} +
+
+ )} +
+ ); +} + +function DatasetCard({ dataset, runId }: { dataset: DatasetStats; runId: string }) { + return ( + +
+ {dataset.name} + + {dataset.passed}/{dataset.total} + +
+
+ +
+
+ {dataset.passed} passed + {dataset.failed > 0 && {dataset.failed} failed} +
+ + ); +} + function StatusBadge({ status }: { status?: string }) { if (!status) return -; diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 68da8651..3ca1d43e 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -10,11 +10,15 @@ import { Link, useMatchRoute } from '@tanstack/react-router'; -import { useExperiments, useRunDetail, useRunList } from '~/lib/api'; +import { useCategoryDatasets, useExperiments, useRunDetail, useRunList } from '~/lib/api'; export function Sidebar() { const matchRoute = useMatchRoute(); const evalMatch = matchRoute({ to: '/evals/$runId/$evalId', fuzzy: true }); + const categoryMatch = matchRoute({ + to: '/runs/$runId/category/$category', + fuzzy: true, + }); const datasetMatch = matchRoute({ to: '/runs/$runId/dataset/$dataset', fuzzy: true, @@ -24,6 +28,12 @@ export function Sidebar() { fuzzy: true, }); + // If on a category detail page, show the category sidebar + if (categoryMatch && typeof categoryMatch === 'object' && 'runId' in categoryMatch) { + const { runId, category } = categoryMatch as { runId: string; category: string }; + return ; + } + // If on a dataset detail page, show evals filtered to that dataset if (datasetMatch && typeof datasetMatch === 'object' && 'runId' in datasetMatch) { const { runId, dataset } = datasetMatch as { runId: string; dataset: string }; @@ -206,6 +216,55 @@ function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string }) ); } +function CategorySidebar({ runId, category }: { runId: string; category: string }) { + const { data } = useCategoryDatasets(runId, category); + const datasets = data?.datasets ?? []; + + return ( + + ); +} + function ExperimentSidebar({ currentExperiment }: { currentExperiment: string }) { const { data } = useExperiments(); const experiments = data?.experiments ?? 
[]; diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index cc3aa022..a1d6c7f0 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -8,6 +8,7 @@ import { queryOptions, useQuery } from '@tanstack/react-query'; import type { + CategoriesResponse, DatasetsResponse, EvalDetailResponse, ExperimentsResponse, @@ -105,6 +106,26 @@ export function evalFileContentOptions(runId: string, evalId: string, filePath: }); } +export function runCategoriesOptions(runId: string) { + return queryOptions({ + queryKey: ['runs', runId, 'categories'], + queryFn: () => + fetchJson(`/api/runs/${encodeURIComponent(runId)}/categories`), + enabled: !!runId, + }); +} + +export function categoryDatasetsOptions(runId: string, category: string) { + return queryOptions({ + queryKey: ['runs', runId, 'categories', category, 'datasets'], + queryFn: () => + fetchJson( + `/api/runs/${encodeURIComponent(runId)}/categories/${encodeURIComponent(category)}/datasets`, + ), + enabled: !!runId && !!category, + }); +} + // ── Hooks ─────────────────────────────────────────────────────────────── export function useRunList() { @@ -146,3 +167,11 @@ export function useEvalFiles(runId: string, evalId: string) { export function useEvalFileContent(runId: string, evalId: string, filePath: string) { return useQuery(evalFileContentOptions(runId, evalId, filePath)); } + +export function useRunCategories(runId: string) { + return useQuery(runCategoriesOptions(runId)); +} + +export function useCategoryDatasets(runId: string, category: string) { + return useQuery(categoryDatasetsOptions(runId, category)); +} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index e6454008..4c1ef6f0 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -49,6 +49,7 @@ export interface EvalResult { testId: string; timestamp?: string; dataset?: string; + category?: string; target?: string; experiment?: string; score: number; @@ -150,3 +151,16 @@ export 
interface FileContentResponse { content: string; language: string; } + +export interface CategorySummary { + name: string; + total: number; + passed: number; + failed: number; + avg_score: number; + dataset_count: number; +} + +export interface CategoriesResponse { + categories: CategorySummary[]; +} diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index 971d6546..118360b3 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -14,6 +14,7 @@ import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId' import { Route as RunsRunIdDatasetDatasetRouteImport } from './routes/runs/$runId_.dataset.$dataset' +import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' const IndexRoute = IndexRouteImport.update({ id: '/', @@ -41,12 +42,19 @@ const RunsRunIdDatasetDatasetRoute = RunsRunIdDatasetDatasetRouteImport.update({ path: '/runs/$runId/dataset/$dataset', getParentRoute: () => rootRouteImport, } as any) +const RunsRunIdCategoryCategoryRoute = + RunsRunIdCategoryCategoryRouteImport.update({ + id: '/runs/$runId_/category/$category', + path: '/runs/$runId/category/$category', + getParentRoute: () => rootRouteImport, + } as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute + '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRoutesByTo { @@ -54,6 +62,7 @@ export interface FileRoutesByTo { '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof 
RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute + '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRoutesById { @@ -62,6 +71,7 @@ export interface FileRoutesById { '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute + '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId_/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute } export interface FileRouteTypes { @@ -71,6 +81,7 @@ export interface FileRouteTypes { | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' + | '/runs/$runId/category/$category' | '/runs/$runId/dataset/$dataset' fileRoutesByTo: FileRoutesByTo to: @@ -78,6 +89,7 @@ export interface FileRouteTypes { | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' + | '/runs/$runId/category/$category' | '/runs/$runId/dataset/$dataset' id: | '__root__' @@ -85,6 +97,7 @@ export interface FileRouteTypes { | '/experiments/$experimentName' | '/runs/$runId' | '/evals/$runId/$evalId' + | '/runs/$runId_/category/$category' | '/runs/$runId_/dataset/$dataset' fileRoutesById: FileRoutesById } @@ -93,6 +106,7 @@ export interface RootRouteChildren { ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute + RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute RunsRunIdDatasetDatasetRoute: typeof RunsRunIdDatasetDatasetRoute } @@ -133,6 +147,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdDatasetDatasetRouteImport parentRoute: typeof rootRouteImport } + '/runs/$runId_/category/$category': { + id: '/runs/$runId_/category/$category' + path: '/runs/$runId/category/$category' + fullPath: 
'/runs/$runId/category/$category' + preLoaderRoute: typeof RunsRunIdCategoryCategoryRouteImport + parentRoute: typeof rootRouteImport + } } } @@ -141,6 +162,7 @@ const rootRouteChildren: RootRouteChildren = { ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, + RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, RunsRunIdDatasetDatasetRoute: RunsRunIdDatasetDatasetRoute, } export const routeTree = rootRouteImport diff --git a/apps/studio/src/routes/runs/$runId_.category.$category.tsx b/apps/studio/src/routes/runs/$runId_.category.$category.tsx new file mode 100644 index 00000000..813b5b6c --- /dev/null +++ b/apps/studio/src/routes/runs/$runId_.category.$category.tsx @@ -0,0 +1,94 @@ +/** + * Category drill-down route: shows datasets filtered to a single category. + * + * Uses the `$runId_` trailing-underscore convention so that + * `/runs/:runId/category/:category` is a sibling of `/runs/:runId`, + * not a child route. + */ + +import { Link, createFileRoute } from '@tanstack/react-router'; + +import { ScoreBar } from '~/components/ScoreBar'; +import { StatsCards } from '~/components/StatsCards'; +import { useCategoryDatasets } from '~/lib/api'; + +export const Route = createFileRoute('/runs/$runId_/category/$category')({ + component: CategoryPage, +}); + +function CategoryPage() { + const { runId, category } = Route.useParams(); + const { data, isLoading, error } = useCategoryDatasets(runId, category); + + if (isLoading) { + return ( +
+
+
+ {['s1', 's2', 's3', 's4', 's5'].map((id) => ( +
+ ))} +
+
+ ); + } + + if (error) { + return ( +
+ Failed to load category: {error.message} +
+ ); + } + + const datasets = data?.datasets ?? []; + const total = datasets.reduce((s, d) => s + d.total, 0); + const passed = datasets.reduce((s, d) => s + d.passed, 0); + const failed = total - passed; + const passRate = total > 0 ? passed / total : 0; + + return ( +
+
+

{category}

+

Category in run: {runId}

+
+ + + + {datasets.length === 0 ? ( +
+

No datasets in this category

+
+ ) : ( +
+

Datasets

+
+ {datasets.map((ds) => ( + +
+ {ds.name} + + {ds.passed}/{ds.total} + +
+
+ +
+
+ {ds.passed} passed + {ds.failed > 0 && {ds.failed} failed} +
+ + ))} +
+
+ )} +
/** Default category for eval files without subdirectory structure. */
export const DEFAULT_CATEGORY = 'Uncategorized';

/** Directory segments that never contribute to a category name. */
const IGNORED_SEGMENTS: ReadonlySet<string> = new Set(['', '.', '..', 'evals']);

/**
 * Derive a human-readable category from an eval file's relative path.
 *
 * The last path segment is treated as the filename and dropped. Of the
 * remaining directory segments, `evals`, `.`, `..` and empty segments are
 * discarded, and the rest are joined with `/`.
 *
 * Both the platform separator (`path.sep`) and `/` are accepted as
 * separators. Splitting on `path.sep` alone made every `/`-separated path
 * collapse to {@link DEFAULT_CATEGORY} on Windows, while the result was
 * always joined with `/`. Dropping `..`/`.`/empty segments keeps traversal
 * markers from `path.relative` (file outside the base directory) and the
 * leading empty segment of an absolute path out of the category name.
 *
 * @param relativePath - path of the eval file, normally relative to the working directory
 * @returns the category, or {@link DEFAULT_CATEGORY} when no directory segment remains
 */
export function deriveCategory(relativePath: string): string {
  // Normalise the platform separator to '/', then split once. On POSIX the
  // first split/join is a no-op, so existing behaviour there is unchanged.
  const segments = relativePath.split(path.sep).join('/').split('/');
  const dirs = segments.slice(0, -1).filter((segment) => !IGNORED_SEGMENTS.has(segment));
  return dirs.length > 0 ? dirs.join('/') : DEFAULT_CATEGORY;
}
(() => new Date()))().toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, score: 0, assertions: [], output: [], @@ -2108,6 +2110,7 @@ async function evaluateCandidate(options: { timestamp: completedAt.toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, conversationId: evalCase.conversation_id, score: score.score, assertions: score.assertions, @@ -2590,6 +2593,7 @@ function buildErrorResult( timestamp: timestamp.toISOString(), testId: evalCase.id, dataset: evalCase.dataset, + category: evalCase.category, conversationId: evalCase.conversation_id, score: 0, assertions: [{ text: `Error: ${message}`, passed: false }], diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 8f5d718a..586cf5c2 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -772,6 +772,7 @@ export type EvaluatorConfig = export interface EvalTest { readonly id: string; readonly dataset?: string; + readonly category?: string; readonly conversation_id?: string; readonly question: string; readonly input: readonly TestMessage[]; @@ -894,6 +895,7 @@ export interface EvaluationResult { readonly timestamp: string; readonly testId: string; readonly dataset?: string; + readonly category?: string; readonly conversationId?: string; readonly score: number; readonly assertions: readonly AssertionEntry[]; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index ed4d4b83..132baed8 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -77,6 +77,8 @@ type LoadOptions = { readonly verbose?: boolean; /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */ readonly filter?: string; + /** Category derived from the eval file's directory path */ + readonly category?: string; }; type RawTestSuite = JsonObject & { @@ -472,6 +474,7 @@ async 
// Table-driven suite for deriveCategory: each row is
// [test title, relative path passed in, expected category].
describe('deriveCategory', () => {
  const cases: ReadonlyArray<readonly [title: string, input: string, expected: string]> = [
    [
      'returns Uncategorized for single-segment path (root-level file)',
      'dataset.eval.yaml',
      DEFAULT_CATEGORY,
    ],
    [
      'returns Uncategorized when only directory is evals',
      'evals/dataset.eval.yaml',
      DEFAULT_CATEGORY,
    ],
    [
      'strips evals segment and returns remaining directory',
      'evals/fundamentals/greetings.eval.yaml',
      'fundamentals',
    ],
    [
      'preserves nested directory paths',
      'evals/cargowise-customs/layout-engine/eval.yaml',
      'cargowise-customs/layout-engine',
    ],
    ['handles paths without evals segment', 'examples/showcase/eval.yaml', 'examples/showcase'],
    [
      'strips evals from middle of multi-level path',
      'examples/showcase/export-screening/evals/dataset.eval.yaml',
      'examples/showcase/export-screening',
    ],
    ['returns Uncategorized for empty string', '', DEFAULT_CATEGORY],
    [
      'returns Uncategorized for just a filename with no directory',
      'eval.yaml',
      DEFAULT_CATEGORY,
    ],
  ];

  // Register one test per row, in the same order as the table.
  for (const [title, input, expected] of cases) {
    test(title, () => {
      expect(deriveCategory(input)).toBe(expected);
    });
  }
});