Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 106 additions & 3 deletions packages/core/src/evaluation/evaluators/code-evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,91 @@ import {
createTargetProxy,
} from '../../runtime/target-proxy.js';
import { toSnakeCaseDeep } from '../case-conversion.js';
import { type ContentImage, isContentArray } from '../content.js';
import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js';
import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js';
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';

/**
 * Threshold in bytes above which output is written to a temp file instead of inlined.
 * The numeric separator is purely cosmetic: the value is 50000.
 */
const FILE_BACKED_OUTPUT_THRESHOLD = 50_000;

/**
 * Regex matching `data:<mediaType>;base64,<data>` URIs.
 *
 * - Capture group 1: the media type (every character up to the first `;`).
 * - Capture group 2: the base64 payload (everything after `;base64,`).
 *
 * The `s` (dotAll) flag lets `.` match line terminators, so payloads that contain
 * line breaks still match. URIs that carry extra parameters between the media type
 * and `;base64` (e.g. `;charset=utf-8`) do NOT match this pattern.
 */
const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;

/**
 * Convert ContentImage blocks in message arrays for code grader consumption.
 *
 * - Data URI images (`data:image/png;base64,...`) → decoded, written to a temp file inside the
 *   directory returned by `getWorkDir`, and replaced with `{ type, media_type, path }`.
 * - Non-data-URI images (already a path or URL) → `source` carried through as `path`.
 * - Non-image blocks → shallow-copied unchanged.
 * - Messages whose content is not a content array, or has no image blocks → passed through
 *   by reference.
 *
 * Returns the original array when no image blocks exist (zero-copy fast path), and `null`
 * when `messages` is `null`/`undefined`.
 *
 * @param messages - Messages to convert; each message's `content` is inspected.
 * @param getWorkDir - Resolves the directory image files are written to. Only invoked when a
 *   data-URI image is actually encountered, so callers can create the directory lazily.
 * @returns The converted messages, the original array if nothing needed converting, or `null`.
 * @throws Propagates any rejection from `getWorkDir` or from writing an image file.
 */
export async function materializeContentForGrader(
  messages: readonly Record<string, unknown>[] | null | undefined,
  getWorkDir: () => Promise<string>,
): Promise<readonly Record<string, unknown>[] | null> {
  if (!messages || messages.length === 0) return messages ?? null;

  const containsImage = (content: unknown): boolean =>
    isContentArray(content) && content.some((block) => block.type === 'image');

  // Fast path: hand back the caller's array untouched when there is nothing to convert.
  if (!messages.some((msg) => containsImage(msg.content))) return messages;

  let counter = 0;
  const result: Record<string, unknown>[] = [];

  for (const msg of messages) {
    const content = msg.content;
    if (!isContentArray(content) || !content.some((block) => block.type === 'image')) {
      result.push(msg);
      continue;
    }

    const blocks: Record<string, unknown>[] = [];
    for (const block of content) {
      if (block.type !== 'image') {
        blocks.push({ ...block });
        continue;
      }

      const img = block as ContentImage;
      const match = DATA_URI_RE.exec(img.source);

      if (match) {
        const [, mediaType, base64Data] = match;
        const ext = imageFileExtension(mediaType);
        const dir = await getWorkDir();
        // The counter is shared across all messages of this call so file names never collide.
        const filePath = join(dir, `img-${counter++}.${ext}`);
        await writeFile(filePath, Buffer.from(base64Data, 'base64'));
        blocks.push({ type: 'image', media_type: img.media_type, path: filePath });
      } else {
        // Already a path or URL → carry through as path
        blocks.push({ type: 'image', media_type: img.media_type, path: img.source });
      }
    }

    result.push({ ...msg, content: blocks });
  }

  return result;
}

/**
 * Derive a filename-safe extension from a media type such as `image/png`.
 *
 * The media type comes from message content, so it is sanitised before being used in a
 * file name: the structured-syntax suffix is dropped (`svg+xml` → `svg`), the result is
 * lower-cased, and anything other than `a-z`, `0-9` and `-` is removed. `jpeg` maps to
 * `jpg`; a missing or empty subtype falls back to `bin`.
 */
function imageFileExtension(mediaType: string | undefined): string {
  const subtype = (mediaType ?? '').split('/')[1] ?? '';
  const base = (subtype.split('+')[0] ?? '').toLowerCase().replace(/[^a-z0-9-]/g, '');
  if (base === 'jpeg') return 'jpg';
  return base.length > 0 ? base : 'bin';
}

export interface CodeEvaluatorOptions {
readonly command: readonly string[];
/** @deprecated Use `command` instead */
Expand Down Expand Up @@ -46,8 +124,23 @@ export class CodeEvaluator implements Evaluator {
}

async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
// Lazy temp dir for materialized image files
let imageTmpDir: string | undefined;
const getImageDir = async () => {
if (!imageTmpDir) {
imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-'));
}
return imageTmpDir;
};

// Materialize multimodal content (data URIs → temp files, source → path)
const materializedOutput = await materializeContentForGrader(
context.output as readonly Record<string, unknown>[] | undefined,
getImageDir,
);

// Determine whether to use file-backed output for large payloads
let outputForPayload = context.output ?? null;
let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
let outputPath: string | undefined;

if (outputForPayload) {
Expand All @@ -63,11 +156,17 @@ export class CodeEvaluator implements Evaluator {
// Build payload (camelCase internally, converted to snake_case for graders)
const payload = {
criteria: context.evalCase.criteria,
expectedOutput: context.evalCase.expected_output,
expectedOutput: await materializeContentForGrader(
context.evalCase.expected_output as readonly Record<string, unknown>[],
getImageDir,
),
output: outputForPayload,
outputPath,
inputFiles: context.evalCase.file_paths,
input: context.evalCase.input,
input: await materializeContentForGrader(
context.evalCase.input as readonly Record<string, unknown>[],
getImageDir,
),
trace: context.trace ?? null,
tokenUsage: context.tokenUsage ?? null,
costUsd: context.costUsd ?? null,
Expand Down Expand Up @@ -196,6 +295,10 @@ export class CodeEvaluator implements Evaluator {
if (outputPath) {
await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {});
}
// Clean up temp dir for materialized images
if (imageTmpDir) {
await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {});
}
}
}
}
Expand Down
68 changes: 1 addition & 67 deletions packages/core/src/evaluation/providers/claude-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import type { WriteStream } from 'node:fs';
import { mkdir } from 'node:fs/promises';
import path from 'node:path';

import type { Content } from '../content.js';
import { extractTextContent, toContentArray } from './claude-content.js';
import { recordClaudeLogEntry } from './claude-log-tracker.js';
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
import type { ClaudeResolvedConfig } from './targets.js';
Expand Down Expand Up @@ -479,72 +479,6 @@ function summarizeEvent(event: Record<string, unknown>): string | undefined {
}
}

/**
 * Convert Claude's content array to Content[] preserving non-text blocks.
 *
 * - `text` blocks with a string `text` become `{ type: 'text', text }`.
 * - `image` blocks with an object `source` become `{ type: 'image', media_type, source }`:
 *   - `media_type` is taken from the block, then from `source.media_type`, and otherwise
 *     defaults to `application/octet-stream`;
 *   - `source` is a `data:` URI built from `source.data` when present, otherwise the URL
 *     found at `source.url` (where Claude's URL image sources place it) or, for backward
 *     compatibility, a top-level `url` on the block; it is `''` when none is available.
 * - All other blocks (including `tool_use` and `tool_result`, which are handled elsewhere)
 *   and non-object entries are skipped.
 *
 * @returns The mapped blocks, or `undefined` if content is not an array or has only text
 *   blocks (no benefit over the simpler string representation).
 */
function toContentArray(content: unknown): Content[] | undefined {
  if (!Array.isArray(content)) return undefined;

  let hasNonText = false;
  const blocks: Content[] = [];

  for (const part of content) {
    if (!part || typeof part !== 'object') continue;
    const p = part as Record<string, unknown>;

    if (p.type === 'text' && typeof p.text === 'string') {
      blocks.push({ type: 'text', text: p.text });
      continue;
    }

    if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) {
      const src = p.source as Record<string, unknown>;

      let mediaType = 'application/octet-stream';
      if (typeof p.media_type === 'string') {
        mediaType = p.media_type;
      } else if (typeof src.media_type === 'string') {
        mediaType = src.media_type;
      }

      let source = '';
      if (typeof src.data === 'string') {
        source = `data:${mediaType};base64,${src.data}`;
      } else if (typeof src.url === 'string') {
        // URL image sources nest the URL inside `source`, not on the block itself.
        source = src.url;
      } else if (typeof p.url === 'string') {
        source = p.url;
      }

      blocks.push({ type: 'image', media_type: mediaType, source });
      hasNonText = true;
    }
    // Every other block type (tool_use, tool_result, ...) is intentionally ignored here.
  }

  return hasNonText && blocks.length > 0 ? blocks : undefined;
}

/**
 * Pull the plain text out of Claude content.
 *
 * A string is returned as-is. For an array, the `text` of every `{ type: 'text' }` block is
 * joined with newlines; anything else yields `undefined`, as does an array with no text blocks.
 */
function extractTextContent(content: unknown): string | undefined {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return undefined;

  const texts = content.flatMap((entry: unknown): string[] => {
    // Only non-null objects can be content blocks.
    if (!entry || typeof entry !== 'object') return [];
    const block = entry as Record<string, unknown>;
    return block.type === 'text' && typeof block.text === 'string' ? [block.text] : [];
  });

  return texts.length === 0 ? undefined : texts.join('\n');
}

/**
* Extract tool calls from Claude's content array format.
*/
Expand Down
94 changes: 94 additions & 0 deletions packages/core/src/evaluation/providers/claude-content.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Shared content-mapping utilities for Claude-based providers.
*
* Converts Claude's raw content array format (Anthropic API) into the AgentV
* Content[] union so that non-text blocks (images) flow through the pipeline
* without lossy flattening.
*
* Used by: claude-cli, claude-sdk, claude (legacy).
*
* ## Claude content format
*
* Claude responses use:
* ```json
* { "content": [
* { "type": "text", "text": "..." },
* { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": "..." } },
* { "type": "tool_use", "name": "...", "input": {...}, "id": "..." }
* ]}
* ```
*
* `toContentArray` maps text and image blocks to `Content[]`.
* `tool_use` and `tool_result` blocks are handled separately as `ToolCall`.
*/

import type { Content } from '../content.js';

/**
 * Convert Claude's raw content array to `Content[]`, preserving non-text blocks.
 *
 * - `text` blocks with a string `text` become `{ type: 'text', text }`.
 * - `image` blocks with an object `source` become `{ type: 'image', media_type, source }`:
 *   - `media_type` is taken from the block, then from `source.media_type`, and otherwise
 *     defaults to `application/octet-stream`;
 *   - `source` is a `data:` URI built from `source.data` when present, otherwise the URL
 *     found at `source.url` (where Claude's URL image sources place it) or, for backward
 *     compatibility, a top-level `url` on the block; it is `''` when none is available.
 * - All other blocks (including `tool_use` and `tool_result`, which are handled separately)
 *   and non-object entries are skipped.
 *
 * Returns `undefined` when the content is a plain string or contains only text
 * blocks — callers should fall back to the text-only string representation in
 * that case (no benefit from wrapping plain text in `Content[]`).
 */
export function toContentArray(content: unknown): Content[] | undefined {
  if (!Array.isArray(content)) return undefined;

  let hasNonText = false;
  const blocks: Content[] = [];

  for (const part of content) {
    if (!part || typeof part !== 'object') continue;
    const p = part as Record<string, unknown>;

    if (p.type === 'text' && typeof p.text === 'string') {
      blocks.push({ type: 'text', text: p.text });
      continue;
    }

    if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) {
      const src = p.source as Record<string, unknown>;

      let mediaType = 'application/octet-stream';
      if (typeof p.media_type === 'string') {
        mediaType = p.media_type;
      } else if (typeof src.media_type === 'string') {
        mediaType = src.media_type;
      }

      let source = '';
      if (typeof src.data === 'string') {
        source = `data:${mediaType};base64,${src.data}`;
      } else if (typeof src.url === 'string') {
        // URL image sources nest the URL inside `source`, not on the block itself.
        source = src.url;
      } else if (typeof p.url === 'string') {
        source = p.url;
      }

      blocks.push({ type: 'image', media_type: mediaType, source });
      hasNonText = true;
    }
    // Every other block type (tool_use, tool_result, ...) is intentionally ignored here.
  }

  return hasNonText && blocks.length > 0 ? blocks : undefined;
}

/**
 * Collect the text carried by Claude content.
 *
 * String content is returned unchanged. Array content yields the `text` of each
 * `type: 'text'` block, newline-separated. Returns `undefined` for any other input
 * and for arrays that contain no text blocks.
 */
export function extractTextContent(content: unknown): string | undefined {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return undefined;

  const collected = content.reduce((acc: string[], candidate: unknown) => {
    const isObject = Boolean(candidate) && typeof candidate === 'object';
    if (isObject) {
      const block = candidate as Record<string, unknown>;
      if (block.type === 'text' && typeof block.text === 'string') {
        acc.push(block.text);
      }
    }
    return acc;
  }, []);

  if (collected.length > 0) {
    return collected.join('\n');
  }
  return undefined;
}
28 changes: 3 additions & 25 deletions packages/core/src/evaluation/providers/claude-sdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import type { WriteStream } from 'node:fs';
import { mkdir } from 'node:fs/promises';
import path from 'node:path';

import { extractTextContent, toContentArray } from './claude-content.js';
import { recordClaudeLogEntry } from './claude-log-tracker.js';
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
import type { ClaudeResolvedConfig } from './targets.js';
Expand Down Expand Up @@ -139,12 +140,13 @@ export class ClaudeSdkProvider implements Provider {
if (betaMessage && typeof betaMessage === 'object') {
const msg = betaMessage as Record<string, unknown>;
const content = msg.content;
const structuredContent = toContentArray(content);
const textContent = extractTextContent(content);
const toolCalls = extractToolCalls(content);

const outputMsg: Message = {
role: 'assistant',
content: textContent,
content: structuredContent ?? textContent,
toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
};
output.push(outputMsg);
Expand Down Expand Up @@ -280,30 +282,6 @@ export class ClaudeSdkProvider implements Provider {
}
}

/**
 * Read the text portion of Claude content.
 * Claude uses: content: [{ type: "text", text: "..." }, ...]
 *
 * A string is passed through untouched; an array contributes the `text` of each text block,
 * joined by newlines. Anything else, or an array without text blocks, gives `undefined`.
 */
function extractTextContent(content: unknown): string | undefined {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return undefined;

  const lines: string[] = [];
  content.forEach((item: unknown) => {
    // Skip primitives and null: only objects can be content blocks.
    if (!item || typeof item !== 'object') return;
    const block = item as Record<string, unknown>;
    if (block.type !== 'text') return;
    if (typeof block.text === 'string') lines.push(block.text);
  });

  return lines.length ? lines.join('\n') : undefined;
}

/**
* Extract tool calls from Claude's content array format.
* Claude uses: content: [{ type: "tool_use", name: "...", input: {...}, id: "..." }, ...]
Expand Down
Loading
Loading