diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 86d6f17c..a9c3383b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -126,6 +126,7 @@ interface NormalizedOptions { readonly transcript?: string; readonly experiment?: string; readonly budgetUsd?: number; + readonly sourceMetadataByEvalFile?: ReadonlyMap>; } function normalizeBoolean(value: unknown): boolean { @@ -197,6 +198,35 @@ function normalizeFilter(value: unknown): string | readonly string[] | undefined return normalizeString(value); } +function normalizeSourceMetadataByEvalFile( + value: unknown, +): ReadonlyMap> | undefined { + if (value instanceof Map) { + const entries = [...value.entries()].filter( + (entry): entry is [string, Record] => + typeof entry[0] === 'string' && + typeof entry[1] === 'object' && + entry[1] !== null && + !Array.isArray(entry[1]), + ); + return entries.length > 0 + ? new Map(entries.map(([key, metadata]) => [path.resolve(key), metadata])) + : undefined; + } + + if (typeof value === 'object' && value !== null && !Array.isArray(value)) { + const entries = Object.entries(value).filter( + (entry): entry is [string, Record] => + typeof entry[1] === 'object' && entry[1] !== null && !Array.isArray(entry[1]), + ); + return entries.length > 0 + ? new Map(entries.map(([key, metadata]) => [path.resolve(key), metadata])) + : undefined; + } + + return undefined; +} + /** * Check whether an eval file's tags satisfy --tag / --exclude-tag filters. * @@ -404,9 +434,30 @@ function normalizeOptions( transcript: normalizeString(rawOptions.transcript), experiment: normalizeString(rawOptions.experiment), budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd), + sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile( + rawOptions.sourceMetadataByEvalFile, + ), } satisfies NormalizedOptions; } +function withSourceMetadata( + result: EvaluationResult, + testFilePath: string, + options: NormalizedOptions, +): EvaluationResult { + const sourceMetadata = options.sourceMetadataByEvalFile?.get(path.resolve(testFilePath)); + if (!sourceMetadata) { + return result; + } + return { + ...result, + metadata: { + ...result.metadata, + ...sourceMetadata, + }, + }; +} + async function ensureFileExists(filePath: string, description: string): Promise { try { await access(filePath, constants.F_OK); @@ -919,9 +970,10 @@ async function runSingleEvalFile(params: { // Trim output messages for results JSONL based on --output-messages. // Each message is trimmed to { role, content } only (no toolCalls, startTime, etc.). // Full output with tool calls goes to OTel. - const trimmedOutput = trimOutputMessages(result.output, options.outputMessages); + const resultWithMetadata = withSourceMetadata(result, testFilePath, options); + const trimmedOutput = trimOutputMessages(resultWithMetadata.output, options.outputMessages); const trimmedResult: EvaluationResult = { - ...result, + ...resultWithMetadata, output: trimmedOutput, }; await outputWriter.append(trimmedResult); @@ -976,7 +1028,7 @@ async function runSingleEvalFile(params: { }, }); - return { results: [...results] }; + return { results: results.map((result) => withSourceMetadata(result, testFilePath, options)) }; } export interface RunEvalResult { @@ -1529,9 +1581,11 @@ export async function runEvalCommand( target: selection.targetName, })); for (const r of skippedResults) { - await outputWriter.append(r); + await outputWriter.append(withSourceMetadata(r, testFilePath, options)); } - allResults.push(...skippedResults); + allResults.push( + ...skippedResults.map((r) => withSourceMetadata(r, testFilePath, options)), + ); } continue; } @@ -1614,21 +1668,27 @@ export async function runEvalCommand( console.error( `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, ); - const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => ({ - timestamp: new Date().toISOString(), - testId: testCase.id, - score: 0, - assertions: [], - output: [], - scores: [], - error: message, - executionStatus: 'execution_error' as const, - failureStage: 'setup' as const, - failureReasonCode: 'setup_error' as const, - durationMs: 0, - tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, - target: selection.targetName, - })); + const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => + withSourceMetadata( + { + timestamp: new Date().toISOString(), + testId: testCase.id, + score: 0, + assertions: [], + output: [], + scores: [], + error: message, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0 }, + target: selection.targetName, + }, + testFilePath, + options, + ), + ); for (const errResult of errorResults) { await outputWriter.append(errResult); } diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 2bbdaeac..1ec21561 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -33,11 +33,13 @@ export interface ResultManifestRecord { readonly input_path?: string; readonly output_path?: string; readonly response_path?: string; + readonly artifact_dir?: string; readonly task_dir?: string; readonly eval_path?: string; readonly targets_path?: string; readonly files_path?: string; readonly graders_path?: string; + readonly metadata?: Record; } function parseJsonlLines(content: string): T[] { @@ -175,6 +177,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E costUsd: record.cost_usd, input: hydrateInput(baseDir, record), output: hydrateOutput(baseDir, record), + metadata: record.metadata, } as EvaluationResult; } diff --git a/apps/cli/src/commands/runs/index.ts b/apps/cli/src/commands/runs/index.ts new file mode 100644 index 00000000..5cb78f04 --- /dev/null +++ b/apps/cli/src/commands/runs/index.ts @@ -0,0 +1,11 @@ +import { subcommands } from 'cmd-ts'; + +import { runsRerunCommand } from './rerun.js'; + +export const runsCommand = subcommands({ + name: 'runs', + description: 'Operate on captured run workspaces', + cmds: { + rerun: runsRerunCommand, + }, +}); diff --git a/apps/cli/src/commands/runs/rerun.ts b/apps/cli/src/commands/runs/rerun.ts new file mode 100644 index 00000000..8e775191 --- /dev/null +++ b/apps/cli/src/commands/runs/rerun.ts @@ -0,0 +1,524 @@ +import { constants } from 'node:fs'; +import { access, readFile } from 'node:fs/promises'; +import path from 'node:path'; + +import { parseYamlValue } from '@agentv/core'; +import { + array, + command, + flag, + multioption, + number, + option, + optional, + positional, + string, +} from 'cmd-ts'; +import { config as loadDotenv } from 'dotenv'; + +import { + buildDefaultRunDir, + createRunDirName, + resolveRunManifestPath, +} from '../eval/result-layout.js'; +import { runEvalCommand } from '../eval/run-eval.js'; +import { type ResultManifestRecord, parseResultManifest } from '../results/manifest.js'; + +const TASK_EVAL_FILENAME = 'EVAL.yaml'; +const TASK_TARGETS_FILENAME = 'targets.yaml'; +const ENV_REF_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g; + +interface SelectedTaskBundle { + readonly record: ResultManifestRecord; + readonly testId: string; + readonly sourceTarget: string; + readonly artifactDir: string; + readonly taskDir: string; + readonly evalPath: string; + readonly targetsPath: string; + readonly taskTarget: string; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function displayRecord(record: ResultManifestRecord): string { + return `${record.test_id ?? 'unknown'}@${record.target ?? 'unknown'}`; +} + +function resolveSourcePath(cwd: string, source: string): string { + return path.isAbsolute(source) ? source : path.resolve(cwd, source); +} + +function resolveRelativeRunPath( + runDir: string, + relativePath: string | undefined, +): string | undefined { + if (!relativePath || relativePath.trim().length === 0) { + return undefined; + } + return path.resolve(runDir, relativePath); +} + +async function ensureFile(filePath: string, label: string): Promise { + try { + await access(filePath, constants.F_OK); + } catch { + throw new Error(`${label} not found: ${filePath}`); + } +} + +function matchesAny(value: string, patterns: readonly string[]): boolean { + return patterns.length === 0 || patterns.some((pattern) => matchesGlob(value, pattern)); +} + +function matchesGlob(value: string, pattern: string): boolean { + let source = ''; + for (const char of pattern) { + if (char === '*') { + source += '.*'; + } else if (char === '?') { + source += '.'; + } else { + source += char.replace(/[.+^${}()|[\]\\]/g, '\\$&'); + } + } + return new RegExp(`^${source}$`).test(value); +} + +function stringArray(value: unknown): readonly string[] { + return Array.isArray(value) + ? value.filter((entry): entry is string => typeof entry === 'string' && entry.length > 0) + : []; +} + +function readExecutionTarget(parsedEval: unknown): string | undefined { + if (!isRecord(parsedEval)) { + return undefined; + } + const execution = parsedEval.execution; + if (isRecord(execution) && typeof execution.target === 'string' && execution.target.length > 0) { + return execution.target; + } + return typeof parsedEval.target === 'string' && parsedEval.target.length > 0 + ? parsedEval.target + : undefined; +} + +async function readTaskTarget(evalPath: string, fallback: string): Promise { + const raw = await readFile(evalPath, 'utf8'); + return readExecutionTarget(parseYamlValue(raw)) ?? fallback; +} + +async function readTargetDefinitions( + targetsPath: string, +): Promise[]> { + const parsed = parseYamlValue(await readFile(targetsPath, 'utf8')); + if (!isRecord(parsed) || !Array.isArray(parsed.targets)) { + throw new Error(`Targets file is missing a top-level targets array: ${targetsPath}`); + } + return parsed.targets.filter(isRecord); +} + +function targetName(definition: Record): string | undefined { + return typeof definition.name === 'string' && definition.name.trim().length > 0 + ? definition.name.trim() + : undefined; +} + +function resolveWholeEnvReference(value: unknown): string | undefined { + if (typeof value !== 'string') { + return undefined; + } + const match = value.trim().match(/^\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}$/); + if (!match) { + return value.trim().length > 0 ? value.trim() : undefined; + } + const resolved = process.env[match[1]]; + return resolved && resolved.trim().length > 0 ? resolved.trim() : undefined; +} + +function referencedTargetNames(definition: Record): readonly string[] { + const names: string[] = []; + for (const key of ['use_target', 'grader_target', 'judge_target'] as const) { + const resolved = resolveWholeEnvReference(definition[key]); + if (resolved && !resolved.includes('${{')) { + names.push(resolved); + } + } + const fallbackTargets = definition.fallback_targets; + if (Array.isArray(fallbackTargets)) { + for (const entry of fallbackTargets) { + const resolved = resolveWholeEnvReference(entry); + if (resolved && !resolved.includes('${{')) { + names.push(resolved); + } + } + } + return names; +} + +function collectEnvRefs(value: unknown, names = new Set()): Set { + if (typeof value === 'string') { + for (const match of value.matchAll(ENV_REF_PATTERN)) { + if (match[1]) { + names.add(match[1]); + } + } + return names; + } + if (Array.isArray(value)) { + for (const entry of value) { + collectEnvRefs(entry, names); + } + return names; + } + if (isRecord(value)) { + for (const [key, entry] of Object.entries(value)) { + if (key === 'required_env') { + for (const required of stringArray(entry)) { + names.add(required); + } + } + collectEnvRefs(entry, names); + } + } + return names; +} + +async function validateTargetFile( + targetsPath: string, + targetNames: readonly string[], + label: string, +): Promise { + const definitions = await readTargetDefinitions(targetsPath); + const byName = new Map>(); + for (const definition of definitions) { + const name = targetName(definition); + if (name) { + byName.set(name, definition); + } + } + + const missingTargets = [...new Set(targetNames)].filter((name) => !byName.has(name)); + if (missingTargets.length > 0) { + throw new Error( + `${label} is incompatible: ${targetsPath} does not define target(s): ${missingTargets.join( + ', ', + )}`, + ); + } + + const envRefs = new Set(); + const seenTargets = new Set(); + const visit = (name: string) => { + if (seenTargets.has(name)) { + return; + } + const definition = byName.get(name); + if (!definition) { + return; + } + seenTargets.add(name); + collectEnvRefs(definition, envRefs); + for (const referencedName of referencedTargetNames(definition)) { + visit(referencedName); + } + }; + for (const name of targetNames) { + visit(name); + } + + const missingEnv = [...envRefs].filter((name) => { + const value = process.env[name]; + return value === undefined || value.trim().length === 0; + }); + if (missingEnv.length > 0) { + throw new Error( + `Missing environment variable(s) required by ${targetsPath}: ${missingEnv.join( + ', ', + )}. Provide --env-file or export them before rerun.`, + ); + } +} + +function isInsideOrSame(root: string, candidate: string): boolean { + const relative = path.relative(path.resolve(root), path.resolve(candidate)); + return relative === '' || (!relative.startsWith('..') && !path.isAbsolute(relative)); +} + +function forbiddenOutputRoots( + sourceRunDir: string, + selected: readonly SelectedTaskBundle[], +): readonly string[] { + return [ + path.resolve(sourceRunDir), + ...selected.flatMap((bundle) => [ + path.resolve(bundle.artifactDir), + path.resolve(bundle.taskDir), + ]), + ]; +} + +function assertOutputIsSeparate(outputDir: string, roots: readonly string[]): void { + const forbiddenRoot = roots.find((root) => isInsideOrSame(root, outputDir)); + if (!forbiddenRoot) { + return; + } + throw new Error( + `Refusing to write rerun output inside the source bundle. Output: ${outputDir}; source: ${forbiddenRoot}`, + ); +} + +function defaultOutputDir( + cwd: string, + sourceRunDir: string, + selected: readonly SelectedTaskBundle[], + experiment?: string, +): string { + const roots = forbiddenOutputRoots(sourceRunDir, selected); + const candidate = buildDefaultRunDir(cwd, experiment ?? 'rerun'); + if (!roots.some((root) => isInsideOrSame(root, candidate))) { + return candidate; + } + return path.join(path.dirname(path.resolve(sourceRunDir)), `rerun-${createRunDirName()}`); +} + +async function loadEnvFile( + envFile: string | undefined, + cwd: string, + verbose: boolean, +): Promise { + if (!envFile) { + return; + } + const resolved = path.isAbsolute(envFile) ? envFile : path.resolve(cwd, envFile); + await ensureFile(resolved, 'Environment file'); + const loaded = loadDotenv({ path: resolved, override: false }); + if (loaded.error) { + throw loaded.error; + } + if (verbose) { + console.log(`Loaded environment from: ${resolved}`); + } +} + +async function loadSelectedTaskBundles(options: { + readonly indexPath: string; + readonly sourceRunDir: string; + readonly testIds: readonly string[]; + readonly sourceTargets: readonly string[]; +}): Promise { + const content = await readFile(options.indexPath, 'utf8'); + const records = parseResultManifest(content); + if (records.length === 0) { + throw new Error(`Run manifest contains no result rows: ${options.indexPath}`); + } + + const selected: SelectedTaskBundle[] = []; + for (const record of records) { + const testId = record.test_id ?? 'unknown'; + const sourceTarget = record.target ?? 'unknown'; + if (!matchesAny(testId, options.testIds) || !matchesAny(sourceTarget, options.sourceTargets)) { + continue; + } + + const recordLabel = displayRecord(record); + const evalPath = + resolveRelativeRunPath(options.sourceRunDir, record.eval_path) ?? + resolveRelativeRunPath( + options.sourceRunDir, + record.task_dir && `${record.task_dir}/${TASK_EVAL_FILENAME}`, + ); + const targetsPath = + resolveRelativeRunPath(options.sourceRunDir, record.targets_path) ?? + resolveRelativeRunPath( + options.sourceRunDir, + record.task_dir && `${record.task_dir}/${TASK_TARGETS_FILENAME}`, + ); + const taskDir = + resolveRelativeRunPath(options.sourceRunDir, record.task_dir) ?? + (evalPath ? path.dirname(evalPath) : undefined); + const artifactDir = + resolveRelativeRunPath(options.sourceRunDir, record.artifact_dir) ?? + (taskDir ? path.dirname(taskDir) : undefined); + + if (!evalPath || !targetsPath || !taskDir || !artifactDir) { + throw new Error( + `Selected result ${recordLabel} is missing task bundle paths. Re-run requires task/EVAL.yaml and task/targets.yaml.`, + ); + } + + await ensureFile(evalPath, `Task eval for ${recordLabel}`); + await ensureFile(targetsPath, `Task targets for ${recordLabel}`); + const taskTarget = await readTaskTarget(evalPath, sourceTarget); + selected.push({ + record, + testId, + sourceTarget, + artifactDir, + taskDir, + evalPath, + targetsPath, + taskTarget, + }); + } + + if (selected.length === 0) { + throw new Error( + 'No captured task bundles matched the provided --test-id/--source-target filters.', + ); + } + return selected; +} + +function buildSourceMetadataByEvalFile( + sourceRunDir: string, + indexPath: string, + selected: readonly SelectedTaskBundle[], +): ReadonlyMap> { + return new Map( + selected.map((bundle) => [ + path.resolve(bundle.evalPath), + { + rerunSource: { + mode: 'rerun', + sourceRunDir: path.resolve(sourceRunDir), + sourceIndexPath: path.resolve(indexPath), + sourceArtifactDir: path.resolve(bundle.artifactDir), + sourceTaskDir: path.resolve(bundle.taskDir), + sourceTestId: bundle.testId, + sourceTarget: bundle.sourceTarget, + sourceTimestamp: bundle.record.timestamp, + }, + }, + ]), + ); +} + +export const runsRerunCommand = command({ + name: 'rerun', + description: 'Rerun captured task bundles with local target environment', + args: { + runDir: positional({ + type: string, + displayName: 'run-dir', + description: 'Run workspace directory or index.jsonl manifest containing task bundles', + }), + testId: multioption({ + type: array(string), + long: 'test-id', + description: 'Only rerun captured test ID(s); glob supported, repeatable', + }), + sourceTarget: multioption({ + type: array(string), + long: 'source-target', + description: 'Only rerun captured source target(s); glob supported, repeatable', + }), + target: multioption({ + type: array(string), + long: 'target', + description: 'Override target name(s) for the new eval run', + }), + targets: option({ + type: optional(string), + long: 'targets', + description: 'Path to replacement targets.yaml for the new eval run', + }), + envFile: option({ + type: optional(string), + long: 'env-file', + description: 'Load local environment variables from a dotenv file before rerun', + }), + output: option({ + type: optional(string), + long: 'output', + short: 'o', + description: 'Artifact directory for the new rerun output', + }), + experiment: option({ + type: optional(string), + long: 'experiment', + description: 'Experiment label for default rerun output (default: rerun)', + }), + workers: option({ + type: optional(number), + long: 'workers', + description: 'Number of parallel test cases within each task eval file', + }), + dryRun: flag({ + long: 'dry-run', + description: 'Use mock provider responses instead of real provider calls', + }), + verbose: flag({ + long: 'verbose', + description: 'Enable verbose logging', + }), + }, + handler: async (args) => { + const cwd = process.cwd(); + const indexPath = resolveRunManifestPath(resolveSourcePath(cwd, args.runDir)); + const sourceRunDir = path.dirname(indexPath); + + await loadEnvFile(args.envFile, cwd, args.verbose); + + const selected = await loadSelectedTaskBundles({ + indexPath, + sourceRunDir, + testIds: args.testId, + sourceTargets: args.sourceTarget, + }); + + const targetOverrides = args.target; + const outputDir = args.output + ? path.resolve(cwd, args.output) + : defaultOutputDir(cwd, sourceRunDir, selected, args.experiment); + assertOutputIsSeparate(outputDir, forbiddenOutputRoots(sourceRunDir, selected)); + + if (args.targets) { + const overrideTargetsPath = path.resolve(cwd, args.targets); + await ensureFile(overrideTargetsPath, 'Target override'); + const targetNames = + targetOverrides.length > 0 ? targetOverrides : selected.map((bundle) => bundle.taskTarget); + await validateTargetFile(overrideTargetsPath, targetNames, 'Target override'); + } else { + const targetNamesByFile = new Map>(); + for (const bundle of selected) { + const targetNames = targetOverrides.length > 0 ? targetOverrides : [bundle.taskTarget]; + const names = targetNamesByFile.get(bundle.targetsPath) ?? new Set(); + for (const targetName of targetNames) { + names.add(targetName); + } + targetNamesByFile.set(bundle.targetsPath, names); + } + for (const [targetsPath, names] of targetNamesByFile.entries()) { + await validateTargetFile(targetsPath, [...names], 'Task bundle targets'); + } + } + + console.log(`Rerunning ${selected.length} captured task bundle(s) from: ${sourceRunDir}`); + console.log(`Rerun output directory: ${outputDir}`); + + const result = await runEvalCommand({ + testFiles: selected.map((bundle) => bundle.evalPath), + rawOptions: { + target: targetOverrides, + targets: args.targets ? path.resolve(cwd, args.targets) : undefined, + output: outputDir, + experiment: args.experiment ?? 'rerun', + workers: args.workers, + dryRun: args.dryRun, + verbose: args.verbose, + sourceMetadataByEvalFile: buildSourceMetadataByEvalFile(sourceRunDir, indexPath, selected), + }, + }); + + if (result?.allExecutionErrors) { + process.exit(2); + } + if (result?.budgetExceeded || result?.thresholdFailed) { + process.exit(1); + } + }, +}); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index ab3d37ac..f55f0b27 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -15,6 +15,7 @@ import { inspectCommand } from './commands/inspect/index.js'; import { pipelineCommand } from './commands/pipeline/index.js'; import { resultsCommand } from './commands/results/index.js'; import { resultsServeCommand } from './commands/results/serve.js'; +import { runsCommand } from './commands/runs/index.js'; import { selfCommand } from './commands/self/index.js'; import { skillsCommand } from './commands/skills/index.js'; import { transpileCommand } from './commands/transpile/index.js'; @@ -39,6 +40,7 @@ export const app = subcommands({ init: initCmdTsCommand, pipeline: pipelineCommand, results: resultsCommand, + runs: runsCommand, self: selfCommand, skills: skillsCommand, serve: resultsServeCommand, @@ -72,6 +74,7 @@ const TOP_LEVEL_COMMANDS = new Set([ 'init', 'pipeline', 'results', + 'runs', 'self', 'skills', 'serve', diff --git a/apps/cli/test/commands/runs/rerun.test.ts b/apps/cli/test/commands/runs/rerun.test.ts new file mode 100644 index 00000000..9a6e3929 --- /dev/null +++ b/apps/cli/test/commands/runs/rerun.test.ts @@ -0,0 +1,362 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { execa } from 'execa'; +import { assertCoreBuild } from '../../setup-core-build.js'; + +assertCoreBuild(); + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const projectRoot = path.resolve(__dirname, '../../../../..'); +const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); +const MOCK_RUNNER = path.join(projectRoot, 'apps/cli/test/fixtures/mock-run-evaluation.ts'); + +interface BundleFixture { + readonly baseDir: string; + readonly cwd: string; + readonly sourceRunDir: string; + readonly outputDir: string; + readonly envFile: string; + readonly overrideTargetsPath: string; +} + +interface CliResult { + readonly stdout: string; + readonly stderr: string; + readonly exitCode: number; +} + +const DEFAULT_TARGETS = `targets: + - name: captured + provider: mock +`; + +async function writeTaskBundle(options: { + readonly sourceRunDir: string; + readonly testId: string; + readonly targetsYaml: string; +}): Promise> { + const artifactDir = path.join(options.sourceRunDir, options.testId); + const taskDir = path.join(artifactDir, 'task'); + const outputsDir = path.join(artifactDir, 'outputs'); + await mkdir(taskDir, { recursive: true }); + await mkdir(outputsDir, { recursive: true }); + + await writeFile( + path.join(taskDir, 'EVAL.yaml'), + `execution: + target: captured +tests: + - id: ${options.testId} + input: + - role: user + content: Prompt for ${options.testId} + expected_output: [] +`, + 'utf8', + ); + await writeFile(path.join(taskDir, 'targets.yaml'), options.targetsYaml, 'utf8'); + await writeFile(path.join(artifactDir, 'grading.json'), '{"assertions":[]}\n', 'utf8'); + await writeFile(path.join(artifactDir, 'timing.json'), '{"duration_ms":1}\n', 'utf8'); + await writeFile(path.join(outputsDir, 'response.md'), '@[assistant]:\nCaptured answer\n', 'utf8'); + + return { + timestamp: '2024-01-01T00:00:00.000Z', + test_id: options.testId, + target: 'captured', + score: 0.1, + artifact_dir: options.testId, + grading_path: `${options.testId}/grading.json`, + timing_path: `${options.testId}/timing.json`, + output_path: `${options.testId}/outputs/response.md`, + response_path: `${options.testId}/outputs/response.md`, + task_dir: `${options.testId}/task`, + eval_path: `${options.testId}/task/EVAL.yaml`, + targets_path: `${options.testId}/task/targets.yaml`, + }; +} + +async function createBundleFixture(targetsYaml = DEFAULT_TARGETS): Promise { + const baseDir = await mkdtemp(path.join(tmpdir(), 'agentv-rerun-')); + const cwd = path.join(baseDir, 'workspace'); + const sourceRunDir = path.join(baseDir, 'source-run'); + const outputDir = path.join(baseDir, 'rerun-output'); + await mkdir(cwd, { recursive: true }); + await mkdir(sourceRunDir, { recursive: true }); + + const records = [ + await writeTaskBundle({ sourceRunDir, testId: 'case-alpha', targetsYaml }), + await writeTaskBundle({ sourceRunDir, testId: 'case-beta', targetsYaml }), + ]; + await writeFile( + path.join(sourceRunDir, 'index.jsonl'), + `${records.map((record) => JSON.stringify(record)).join('\n')}\n`, + 'utf8', + ); + + const envFile = path.join(baseDir, 'local.env'); + await writeFile(envFile, 'LOCAL_AGENT_COMMAND=echo local-agent\n', 'utf8'); + const overrideTargetsPath = path.join(baseDir, 'override-targets.yaml'); + await writeFile( + overrideTargetsPath, + `targets: + - name: local + provider: mock +`, + 'utf8', + ); + + return { baseDir, cwd, sourceRunDir, outputDir, envFile, overrideTargetsPath }; +} + +async function runCli( + fixture: BundleFixture, + args: readonly string[], + options?: { readonly cwd?: string; readonly env?: Record }, +): Promise { + const result = await execa('bun', ['--no-env-file', CLI_ENTRY, ...args], { + cwd: options?.cwd ?? fixture.cwd, + env: { + ...process.env, + AGENTV_NO_UPDATE_CHECK: '1', + CI: 'true', + LOCAL_AGENT_COMMAND: undefined, + AGENTEVO_CLI_EVAL_RUNNER: MOCK_RUNNER, + ...options?.env, + }, + reject: false, + }); + return { + stdout: result.stdout, + stderr: result.stderr, + exitCode: result.exitCode ?? 0, + }; +} + +async function readJsonLines(filePath: string): Promise[]> { + const raw = await readFile(filePath, 'utf8'); + return raw + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line.length > 0) + .map((line) => JSON.parse(line) as Record); +} + +function extractRerunOutputDir(stdout: string): string { + const line = stdout.split(/\r?\n/).find((entry) => entry.startsWith('Rerun output directory:')); + if (!line) { + throw new Error(`Missing rerun output line:\n${stdout}`); + } + return line.replace('Rerun output directory:', '').trim(); +} + +describe('agentv runs rerun', () => { + let fixtures: BundleFixture[] = []; + + beforeEach(() => { + fixtures = []; + }); + + afterEach(async () => { + await Promise.all( + fixtures.map((fixture) => rm(fixture.baseDir, { recursive: true, force: true })), + ); + }); + + async function fixture(targetsYaml?: string): Promise { + const created = await createBundleFixture(targetsYaml); + fixtures.push(created); + return created; + } + + it('reruns captured task bundles into an explicit output directory with source metadata', async () => { + const created = await fixture(); + + const result = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--output', + created.outputDir, + '--verbose', + ]); + + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain('Rerunning 2 captured task bundle(s)'); + const rows = await readJsonLines(path.join(created.outputDir, 'index.jsonl')); + expect(rows.map((row) => row.test_id)).toEqual(['case-alpha', 'case-beta']); + expect(rows.every((row) => row.target === 'captured')).toBe(true); + expect(rows[0].metadata).toMatchObject({ + rerun_source: { + mode: 'rerun', + source_test_id: 'case-alpha', + source_target: 'captured', + }, + }); + + const responsePath = path.join(created.outputDir, String(rows[0].response_path)); + const response = await readFile(responsePath, 'utf8'); + expect(response).toContain('Alpha answer'); + expect(response).not.toContain('Captured answer'); + }, 30_000); + + it('fails clearly for missing env and accepts an explicit env file', async () => { + const created = await fixture(`targets: + - name: captured + provider: cli + command: \${{ LOCAL_AGENT_COMMAND }} +`); + + const missing = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--output', + created.outputDir, + '--dry-run', + ]); + expect(missing.exitCode).toBe(1); + expect(missing.stderr).toContain('Missing environment variable(s)'); + expect(missing.stderr).toContain('LOCAL_AGENT_COMMAND'); + + const withAmbientEnv = await runCli( + created, + ['runs', 'rerun', created.sourceRunDir, '--output', created.outputDir, '--dry-run'], + { env: { LOCAL_AGENT_COMMAND: 'echo ambient-agent' } }, + ); + expect(withAmbientEnv.exitCode).toBe(0); + + const withEnvFile = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--output', + created.outputDir, + '--env-file', + created.envFile, + '--dry-run', + ]); + expect(withEnvFile.exitCode).toBe(0); + }, 30_000); + + it('fails loudly when selected bundle artifacts are missing', async () => { + const created = await fixture(); + await rm(path.join(created.sourceRunDir, 'case-beta', 'task', 'targets.yaml')); + + const result = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--test-id', + 'case-beta', + '--output', + created.outputDir, + ]); + + expect(result.exitCode).toBe(1); + expect(result.stderr).toContain('Task targets for case-beta@captured not found'); + }, 30_000); + + it('reruns a selected test subset from index.jsonl', async () => { + const created = await fixture(); + + const result = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--test-id', + 'case-alpha', + '--output', + created.outputDir, + ]); + + expect(result.exitCode).toBe(0); + const rows = await readJsonLines(path.join(created.outputDir, 'index.jsonl')); + expect(rows.map((row) => row.test_id)).toEqual(['case-alpha']); + }, 30_000); + + it('chooses a default output directory outside the source task folder', async () => { + const created = await fixture(); + const taskDir = path.join(created.sourceRunDir, 'case-alpha', 'task'); + + const result = await runCli( + created, + ['runs', 'rerun', created.sourceRunDir, '--test-id', 'case-alpha'], + { cwd: taskDir }, + ); + + expect(result.exitCode).toBe(0); + const outputDir = extractRerunOutputDir(result.stdout); + expect(path.relative(taskDir, outputDir).startsWith('..')).toBe(true); + const rows = await readJsonLines(path.join(outputDir, 'index.jsonl')); + expect(rows.map((row) => row.test_id)).toEqual(['case-alpha']); + }, 30_000); + + it('rejects explicit output nested under a source task folder', async () => { + const created = await fixture(); + const nestedOutput = path.join( + created.sourceRunDir, + 'case-alpha', + 'task', + '.agentv', + 'results', + ); + + const result = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--test-id', + 'case-alpha', + '--output', + nestedOutput, + ]); + + expect(result.exitCode).toBe(1); + expect(result.stderr).toContain('Refusing to write rerun output inside the source bundle'); + }, 30_000); + + it('fails loudly for incompatible target overrides', async () => { + const created = await fixture(); + + const result = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--targets', + created.overrideTargetsPath, + '--target', + 'missing', + '--output', + created.outputDir, + '--dry-run', + ]); + + expect(result.exitCode).toBe(1); + expect(result.stderr).toContain('Target override is incompatible'); + expect(result.stderr).toContain('missing'); + }, 30_000); + + it('accepts a compatible target override file and target selection', async () => { + const created = await fixture(); + + const result = await runCli(created, [ + 'runs', + 'rerun', + created.sourceRunDir, + '--targets', + created.overrideTargetsPath, + '--target', + 'local', + '--output', + created.outputDir, + ]); + + expect(result.exitCode).toBe(0); + const rows = await readJsonLines(path.join(created.outputDir, 'index.jsonl')); + expect(rows.every((row) => row.target === 'local')).toBe(true); + }, 30_000); +}); diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index da198883..5f92fee9 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -47,10 +47,26 @@ interface EvaluationResultLike { readonly timestamp: string; } -function buildResults(targetName: string): EvaluationResultLike[] { +function evalCaseIds(evalCases: ReadonlyArray | undefined): readonly string[] { + if (!Array.isArray(evalCases) || evalCases.length === 0) { + return ['case-alpha', 'case-beta']; + } + return evalCases + .map((evalCase) => + evalCase && + typeof evalCase === 'object' && + 'id' in evalCase && + typeof evalCase.id === 'string' + ? evalCase.id + : undefined, + ) + .filter((id): id is string => id !== undefined); +} + +function buildResult(targetName: string, testId: string, index: number): EvaluationResultLike { const baseTime = new Date('2024-01-01T00:00:00.000Z'); - return [ - { + if (testId === 'case-alpha') { + return { testId: 'case-alpha', score: 0.6, assertions: [{ text: 'alpha', passed: true }], @@ -58,8 +74,10 @@ function buildResults(targetName: string): EvaluationResultLike[] { expectedAspectCount: 1, target: targetName, timestamp: baseTime.toISOString(), - }, - { + }; + } + if (testId === 'case-beta') { + return { testId: 'case-beta', score: 0.9, assertions: [ @@ -71,8 +89,24 @@ function buildResults(targetName: string): EvaluationResultLike[] { expectedAspectCount: 3, target: targetName, timestamp: new Date(baseTime.getTime() + 60_000).toISOString(), - }, - ]; + }; + } + return { + testId, + score: 1, + assertions: [{ text: testId, passed: true }], + output: [{ role: 'assistant', content: `${testId} answer` }], + expectedAspectCount: 1, + target: targetName, + timestamp: new Date(baseTime.getTime() + index * 60_000).toISOString(), + }; +} + +function buildResults( + targetName: string, + evalCases: ReadonlyArray | undefined, +): EvaluationResultLike[] { + return evalCaseIds(evalCases).map((testId, index) => buildResult(targetName, testId, index)); } async function maybeWriteDiagnostics( @@ -135,7 +169,7 @@ async function maybeWritePromptDump( export async function runEvaluation( options: RunEvaluationOptionsLike, ): Promise { - const results = buildResults(options.target?.name ?? 'unknown-target'); + const results = buildResults(options.target?.name ?? 'unknown-target', options.evalCases); await maybeWriteDiagnostics(options, results); await maybeWritePromptDump(