Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ export interface IndexArtifactEntry {
readonly timestamp: string;
readonly test_id: string;
readonly dataset?: string;
readonly category?: string;
readonly conversation_id?: string;
readonly score: number;
readonly target: string;
Expand Down Expand Up @@ -508,6 +509,7 @@ export function buildIndexArtifactEntry(
timestamp: result.timestamp,
test_id: result.testId ?? 'unknown',
dataset: getDataset(result),
category: result.category,
conversation_id: result.conversationId,
score: result.score,
target: result.target ?? 'unknown',
Expand Down Expand Up @@ -539,6 +541,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
timestamp: result.timestamp,
test_id: result.testId ?? 'unknown',
dataset: getDataset(result),
category: result.category,
conversation_id: result.conversationId,
score: result.score,
target: result.target ?? 'unknown',
Expand Down
17 changes: 1 addition & 16 deletions apps/cli/src/commands/eval/discover.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import path from 'node:path';
import { DEFAULT_EVAL_PATTERNS, loadConfig } from '@agentv/core';
import { DEFAULT_EVAL_PATTERNS, deriveCategory, loadConfig } from '@agentv/core';
import fg from 'fast-glob';

import { findRepoRoot } from './shared.js';
Expand Down Expand Up @@ -52,21 +52,6 @@ export async function discoverEvalFiles(cwd: string): Promise<readonly Discovere
return evalFiles;
}

/**
 * Derive a human-readable category from an eval file's relative path.
 *
 * The category is the file's directory portion with every `evals` segment
 * removed, joined with `/`. When no directory segment remains, the category
 * is `'root'`.
 *
 * Examples:
 * - "examples/showcase/export-screening/evals/dataset.eval.yaml" → "examples/showcase/export-screening"
 * - "evals/dataset.eval.yaml" → "root"
 * - "dataset.eval.yaml" → "root"
 *
 * @param relativePath - Relative path of an eval file, using native or `/` separators.
 * @returns The `/`-joined category, or `'root'` when nothing is left.
 */
function deriveCategory(relativePath: string): string {
  // Normalise native separators to '/' before splitting so forward-slash
  // paths are also split on Windows. On POSIX this is a no-op.
  const segments = relativePath.split(path.sep).join('/').split('/');

  // Drop the trailing filename, then every "evals" folder.
  const dirs = segments.slice(0, -1).filter((dir) => dir !== 'evals');
  return dirs.length > 0 ? dirs.join('/') : 'root';
}

/** Get unique categories from discovered eval files. */
export function getCategories(files: readonly DiscoveredEvalFile[]): readonly string[] {
const categories = new Set<string>();
Expand Down
5 changes: 5 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
ResponseCache,
type TrialsConfig,
runEvaluation as defaultRunEvaluation,
deriveCategory,
ensureVSCodeSubagents,
loadConfig,
loadTestSuite,
Expand Down Expand Up @@ -444,9 +445,13 @@ async function prepareFileMetadata(params: {
verbose: options.verbose,
});

const relativePath = path.relative(cwd, testFilePath);
const category = deriveCategory(relativePath);

const suite = await loadTestSuite(testFilePath, repoRoot, {
verbose: options.verbose,
filter: options.filter,
category,
});
const filteredIds = suite.tests.map((value) => value.id);

Expand Down
7 changes: 4 additions & 3 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
*/
import { readFile } from 'node:fs/promises';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join, resolve } from 'node:path';
import { dirname, join, relative, resolve } from 'node:path';

import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { loadTestSuite } from '@agentv/core';
import { deriveCategory, loadTestSuite } from '@agentv/core';
import { command, option, optional, positional, string } from 'cmd-ts';

import { buildDefaultRunDir } from '../eval/result-layout.js';
Expand Down Expand Up @@ -57,7 +57,8 @@ export const evalInputCommand = command({
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
const evalDir = dirname(resolvedEvalPath);

const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
const tests = suite.tests;

if (tests.length === 0) {
Expand Down
7 changes: 4 additions & 3 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ import { execSync } from 'node:child_process';
import { existsSync, readFileSync, unlinkSync } from 'node:fs';
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, resolve } from 'node:path';
import { dirname, join, relative, resolve } from 'node:path';

import { executeScript, loadTestSuite } from '@agentv/core';
import { deriveCategory, executeScript, loadTestSuite } from '@agentv/core';
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
import { command, number, option, optional, positional, string } from 'cmd-ts';

Expand Down Expand Up @@ -91,7 +91,8 @@ export const evalRunCommand = command({
const evalDir = dirname(resolvedEvalPath);

// ── Step 1: Extract inputs (same as pipeline input) ──────────────
const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
const tests = suite.tests;

if (tests.length === 0) {
Expand Down
2 changes: 2 additions & 0 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export interface ResultManifestRecord {
readonly test_id?: string;
readonly eval_id?: string;
readonly dataset?: string;
readonly category?: string;
readonly experiment?: string;
readonly target?: string;
readonly score: number;
Expand Down Expand Up @@ -125,6 +126,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
timestamp: record.timestamp,
testId,
dataset: record.dataset,
category: record.category,
target: record.target,
score: record.score,
executionStatus: record.execution_status,
Expand Down
78 changes: 77 additions & 1 deletion apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { command, number, option, optional, positional, string } from 'cmd-ts';

import type { EvaluationResult } from '@agentv/core';
import { DEFAULT_CATEGORY, type EvaluationResult } from '@agentv/core';
import { Hono } from 'hono';

import { parseJsonlResults } from '../eval/artifact-writer.js';
Expand Down Expand Up @@ -304,6 +304,82 @@ export function createApp(
}
});

// Per-category roll-up for a single run: result counts, pass/fail split,
// mean score and number of distinct datasets seen in each category.
app.get('/api/runs/:filename/categories', (c) => {
  const requested = c.req.param('filename');
  const run = listResultFiles(searchDir).find((candidate) => candidate.filename === requested);
  if (!run) {
    return c.json({ error: 'Run not found' }, 404);
  }
  try {
    const results = patchTestIds(loadManifestResults(run.path));
    const buckets = new Map<
      string,
      { total: number; passed: number; scoreSum: number; datasets: Set<string> }
    >();
    for (const result of results) {
      // Results without a category are grouped under DEFAULT_CATEGORY.
      const key = result.category ?? DEFAULT_CATEGORY;
      let bucket = buckets.get(key);
      if (bucket === undefined) {
        bucket = { total: 0, passed: 0, scoreSum: 0, datasets: new Set<string>() };
      }
      bucket.total += 1;
      if (result.score >= 1) {
        bucket.passed += 1;
      }
      bucket.scoreSum += result.score;
      // Dataset label falls back to the target, then to 'default'.
      bucket.datasets.add(result.dataset ?? result.target ?? 'default');
      buckets.set(key, bucket);
    }
    const categories = Array.from(buckets, ([name, bucket]) => {
      const failed = bucket.total - bucket.passed;
      const avgScore = bucket.total > 0 ? bucket.scoreSum / bucket.total : 0;
      return {
        name,
        total: bucket.total,
        passed: bucket.passed,
        failed,
        avg_score: avgScore,
        dataset_count: bucket.datasets.size,
      };
    });
    return c.json({ categories });
  } catch {
    return c.json({ error: 'Failed to load categories' }, 500);
  }
});

// Datasets within a category for a run.
// Responds with { datasets } where each item has name, total, passed, failed
// and avg_score; 404 when no result file matches :filename, 500 when the
// results cannot be loaded.
app.get('/api/runs/:filename/categories/:category/datasets', (c) => {
  const filename = c.req.param('filename');
  const rawCategory = c.req.param('category');
  // NOTE(review): Hono appears to return route params already percent-decoded
  // (confirm for the pinned version). Decoding again throws URIError when the
  // category contains a literal '%', so fall back to the raw value instead of
  // failing the request.
  let category = rawCategory;
  try {
    category = decodeURIComponent(rawCategory);
  } catch {
    category = rawCategory;
  }
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c.json({ error: 'Run not found' }, 404);
  }
  try {
    const loaded = patchTestIds(loadManifestResults(meta.path));
    // Results without a category are matched as DEFAULT_CATEGORY.
    const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
    const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
    for (const r of filtered) {
      // Dataset label falls back to the target, then to 'default'.
      const ds = r.dataset ?? r.target ?? 'default';
      const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
      entry.total++;
      if (r.score >= 1) entry.passed++;
      entry.scoreSum += r.score;
      datasetMap.set(ds, entry);
    }
    const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
      name,
      total: entry.total,
      passed: entry.passed,
      failed: entry.total - entry.passed,
      avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
    }));
    return c.json({ datasets });
  } catch {
    return c.json({ error: 'Failed to load datasets' }, 500);
  }
});

// Full eval detail with hydrated artifacts
app.get('/api/runs/:filename/evals/:evalId', (c) => {
const filename = c.req.param('filename');
Expand Down
13 changes: 12 additions & 1 deletion apps/studio/src/components/Breadcrumbs.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,18 @@ function deriveSegments(matches: ReturnType<typeof useMatches>): BreadcrumbSegme

if (routeId === '/' || routeId === '/_layout') continue;

if (routeId.includes('/runs/$runId/dataset/$dataset')) {
if (routeId.includes('/runs/$runId/category/$category')) {
if (!segments.some((s) => s.label === params.runId)) {
segments.push({
label: params.runId ?? 'Run',
to: `/runs/${encodeURIComponent(params.runId)}`,
});
}
segments.push({
label: params.category ?? 'Category',
to: match.pathname,
});
} else if (routeId.includes('/runs/$runId/dataset/$dataset')) {
segments.push({
label: params.dataset ?? 'Dataset',
to: match.pathname,
Expand Down
Loading
Loading