EntityProcess · christso · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -95,6 +95,7 @@ export interface IndexArtifactEntry {
   readonly timestamp: string;
   readonly test_id: string;
   readonly dataset?: string;
+  readonly category?: string;
   readonly conversation_id?: string;
   readonly score: number;
   readonly target: string;
@@ -508,6 +509,7 @@ export function buildIndexArtifactEntry(
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
     dataset: getDataset(result),
+    category: result.category,
     conversation_id: result.conversationId,
     score: result.score,
     target: result.target ?? 'unknown',
@@ -539,6 +541,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
     dataset: getDataset(result),
+    category: result.category,
     conversation_id: result.conversationId,
     score: result.score,
     target: result.target ?? 'unknown',

diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts
@@ -1,5 +1,5 @@
 import path from 'node:path';
-import { DEFAULT_EVAL_PATTERNS, loadConfig } from '@agentv/core';
+import { DEFAULT_EVAL_PATTERNS, deriveCategory, loadConfig } from '@agentv/core';
 import fg from 'fast-glob';
 
 import { findRepoRoot } from './shared.js';
@@ -52,21 +52,6 @@ export async function discoverEvalFiles(cwd: string): Promise<readonly Discovere
   return evalFiles;
 }
 
-/** Derive a human-readable category from the relative path. */
-function deriveCategory(relativePath: string): string {
-  const parts = relativePath.split(path.sep);
-  // Use the first meaningful directory as category
-  // e.g., "examples/showcase/export-screening/evals/dataset.eval.yaml" → "showcase/export-screening"
-  // e.g., "evals/dataset.eval.yaml" → "evals"
-  if (parts.length <= 1) {
-    return 'root';
-  }
-
-  // Remove the filename and "evals" folder if present
-  const dirs = parts.slice(0, -1).filter((d) => d !== 'evals');
-  return dirs.length > 0 ? dirs.join('/') : 'root';
-}
-
 /** Get unique categories from discovered eval files. */
 export function getCategories(files: readonly DiscoveredEvalFile[]): readonly string[] {
   const categories = new Set<string>();

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -14,6 +14,7 @@ import {
   ResponseCache,
   type TrialsConfig,
   runEvaluation as defaultRunEvaluation,
+  deriveCategory,
   ensureVSCodeSubagents,
   loadConfig,
   loadTestSuite,
@@ -444,9 +445,13 @@ async function prepareFileMetadata(params: {
     verbose: options.verbose,
   });
 
+  const relativePath = path.relative(cwd, testFilePath);
+  const category = deriveCategory(relativePath);
+
   const suite = await loadTestSuite(testFilePath, repoRoot, {
     verbose: options.verbose,
     filter: options.filter,
+    category,
   });
   const filteredIds = suite.tests.map((value) => value.id);
 

diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
@@ -20,10 +20,10 @@
  */
 import { readFile } from 'node:fs/promises';
 import { mkdir, writeFile } from 'node:fs/promises';
-import { dirname, join, resolve } from 'node:path';
+import { dirname, join, relative, resolve } from 'node:path';
 
 import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
-import { loadTestSuite } from '@agentv/core';
+import { deriveCategory, loadTestSuite } from '@agentv/core';
 import { command, option, optional, positional, string } from 'cmd-ts';
 
 import { buildDefaultRunDir } from '../eval/result-layout.js';
@@ -57,7 +57,8 @@ export const evalInputCommand = command({
     const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
     const evalDir = dirname(resolvedEvalPath);
 
-    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
+    const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
     const tests = suite.tests;
 
     if (tests.length === 0) {

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -15,9 +15,9 @@ import { execSync } from 'node:child_process';
 import { existsSync, readFileSync, unlinkSync } from 'node:fs';
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
-import { dirname, join, resolve } from 'node:path';
+import { dirname, join, relative, resolve } from 'node:path';
 
-import { executeScript, loadTestSuite } from '@agentv/core';
+import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core';
 import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
 import { command, number, option, optional, positional, string } from 'cmd-ts';
 
@@ -91,7 +91,8 @@ export const evalRunCommand = command({
     const evalDir = dirname(resolvedEvalPath);
 
     // ── Step 1: Extract inputs (same as pipeline input) ──────────────
-    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
+    const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
     const tests = suite.tests;
 
     if (tests.length === 0) {

diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
@@ -12,6 +12,7 @@ export interface ResultManifestRecord {
   readonly test_id?: string;
   readonly eval_id?: string;
   readonly dataset?: string;
+  readonly category?: string;
   readonly experiment?: string;
   readonly target?: string;
   readonly score: number;
@@ -125,6 +126,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
     timestamp: record.timestamp,
     testId,
     dataset: record.dataset,
+    category: record.category,
     target: record.target,
     score: record.score,
     executionStatus: record.execution_status,

diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
@@ -31,7 +31,7 @@ import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { command, number, option, optional, positional, string } from 'cmd-ts';
 
-import type { EvaluationResult } from '@agentv/core';
+import { DEFAULT_CATEGORY, type EvaluationResult } from '@agentv/core';
 import { Hono } from 'hono';
 
 import { parseJsonlResults } from '../eval/artifact-writer.js';
@@ -304,6 +304,82 @@ export function createApp(
     }
   });
 
+  // GET /api/runs/:filename/categories — per-category totals for one run.
+  // Results without a category are grouped under DEFAULT_CATEGORY.
+  app.get('/api/runs/:filename/categories', (c) => {
+    const runFile = c.req.param('filename');
+    const run = listResultFiles(searchDir).find((m) => m.filename === runFile);
+    if (run === undefined) return c.json({ error: 'Run not found' }, 404);
+
+    type Tally = { total: number; passed: number; scoreSum: number; datasets: Set<string> };
+    try {
+      const tallies = new Map<string, Tally>();
+      for (const result of patchTestIds(loadManifestResults(run.path))) {
+        const key = result.category ?? DEFAULT_CATEGORY;
+        let tally = tallies.get(key);
+        if (tally === undefined) {
+          tally = { total: 0, passed: 0, scoreSum: 0, datasets: new Set<string>() };
+          tallies.set(key, tally);
+        }
+        tally.total += 1;
+        tally.scoreSum += result.score;
+        // A score of 1 or more counts as a pass.
+        if (result.score >= 1) tally.passed += 1;
+        // Dataset label falls back to the target, then to 'default'.
+        tally.datasets.add(result.dataset ?? result.target ?? 'default');
+      }
+      // Flatten the tallies into the JSON response shape.
+      const categories = Array.from(tallies, ([name, tally]) => {
+        const { total, passed, scoreSum, datasets } = tally;
+        return {
+          name,
+          total,
+          passed,
+          failed: total - passed,
+          avg_score: total > 0 ? scoreSum / total : 0,
+          dataset_count: datasets.size,
+        };
+      });
+      return c.json({ categories });
+    } catch {
+      return c.json({ error: 'Failed to load categories' }, 500);
+    }
+  });
+
+  // Per-dataset totals for the results of one run within the requested category.
+  app.get('/api/runs/:filename/categories/:category/datasets', (c) => {
+    const filename = c.req.param('filename');
+    // Hono already percent-decodes path params; decoding a second time would
+    // corrupt names containing a literal '%' or throw URIError before the try.
+    const category = c.req.param('category');
+    const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
+    if (!meta) return c.json({ error: 'Run not found' }, 404);
+    try {
+      const loaded = patchTestIds(loadManifestResults(meta.path));
+      const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
+      for (const r of loaded) {
+        // Uncategorised results belong to DEFAULT_CATEGORY.
+        if ((r.category ?? DEFAULT_CATEGORY) !== category) continue;
+        const ds = r.dataset ?? r.target ?? 'default';
+        const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
+        entry.total++;
+        if (r.score >= 1) entry.passed++;
+        entry.scoreSum += r.score;
+        datasetMap.set(ds, entry);
+      }
+      const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
+        name,
+        total: entry.total,
+        passed: entry.passed,
+        failed: entry.total - entry.passed,
+        avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
+      }));
+      return c.json({ datasets });
+    } catch {
+      return c.json({ error: 'Failed to load datasets' }, 500);
+    }
+  });
+
   // Full eval detail with hydrated artifacts
   app.get('/api/runs/:filename/evals/:evalId', (c) => {
     const filename = c.req.param('filename');

diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx
@@ -23,7 +23,18 @@ function deriveSegments(matches: ReturnType<typeof useMatches>): BreadcrumbSegme
 
     if (routeId === '/' || routeId === '/_layout') continue;
 
-    if (routeId.includes('/runs/$runId/dataset/$dataset')) {
+    if (routeId.includes('/runs/$runId/category/$category')) {
+      if (!segments.some((s) => s.label === params.runId)) {
+        segments.push({
+          label: params.runId ?? 'Run',
+          to: `/runs/${encodeURIComponent(params.runId)}`,
+        });
+      }
+      segments.push({
+        label: params.category ?? 'Category',
+        to: match.pathname,
+      });
+    } else if (routeId.includes('/runs/$runId/dataset/$dataset')) {
       segments.push({
         label: params.dataset ?? 'Dataset',
         to: match.pathname,