EntityProcess · christso · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts
@@ -9,13 +9,91 @@ import {
   createTargetProxy,
 } from '../../runtime/target-proxy.js';
 import { toSnakeCaseDeep } from '../case-conversion.js';
+import { type ContentImage, isContentArray } from '../content.js';
 import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js';
 import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js';
 import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
 
 /** Threshold in bytes above which output is written to a temp file instead of inlined. */
 const FILE_BACKED_OUTPUT_THRESHOLD = 50_000;
 
+/** Matches `data:<mediaType>;base64,<data>` URIs: group 1 is the media type, group 2 the payload (`s` lets `.` span newlines). */
+const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
+
+/**
+ * Rewrite `image` content blocks so a code grader receives file paths instead of inline data.
+ *
+ * - Data URI images (`data:image/png;base64,...`) → decoded, written to `img-<n>.<ext>` in the
+ *   directory returned by `getWorkDir`, and replaced with `{ type, media_type, path }`.
+ * - Any other image `source` (already a path or URL) → carried through unchanged as `path`.
+ * - Non-image blocks are shallow-copied; messages without image blocks are reused as-is.
+ * - Messages with plain string content → passed through unchanged.
+ *
+ * The file extension is derived from the data URI's media type, which is untrusted input, so it
+ * is reduced to a leading alphanumeric run (`svg+xml` → `svg`); anything else becomes `bin`. This
+ * keeps path separators and `..` out of the file name passed to `join`.
+ *
+ * @param messages Messages to convert; `null`/`undefined` yield `null`, an empty array is returned as-is.
+ * @param getWorkDir Called only when a data URI image is found; resolves to a writable directory.
+ * @returns The original array when no image blocks exist (zero-copy fast path), else a new array.
+ */
+export async function materializeContentForGrader(
+  messages: readonly Record<string, unknown>[] | null | undefined,
+  getWorkDir: () => Promise<string>,
+): Promise<readonly Record<string, unknown>[] | null> {
+  if (!messages || messages.length === 0) return messages ?? null;
+
+  // Fast path: hand back the input untouched when no message carries an image block.
+  const hasImage = (content: unknown): boolean =>
+    isContentArray(content) && content.some((block) => block.type === 'image');
+  if (!messages.some((msg) => hasImage(msg.content))) return messages;
+
+  // Numbers the written files so each image in this call gets a distinct name.
+  let counter = 0;
+  const result: Record<string, unknown>[] = [];
+
+  for (const msg of messages) {
+    // Plain-string content or no image blocks: reuse the message object as-is.
+    if (!isContentArray(msg.content) || !hasImage(msg.content)) {
+      result.push(msg);
+      continue;
+    }
+
+    const blocks: Record<string, unknown>[] = [];
+    for (const block of msg.content) {
+      if (block.type !== 'image') {
+        // Non-image blocks are shallow-copied unchanged.
+        blocks.push({ ...block });
+        continue;
+      }
+
+      const img = block as ContentImage;
+      const match = DATA_URI_RE.exec(img.source);
+      if (!match) {
+        // Already a path or URL → carry through as path
+        blocks.push({ type: 'image', media_type: img.media_type, path: img.source });
+        continue;
+      }
+
+      const [, mediaType, base64Data] = match;
+      // Keep only the leading alphanumeric run of the subtype so nothing from the
+      // (untrusted) media type can introduce separators or `..` into the file name.
+      const subtype = /^[a-z0-9]+/i.exec(mediaType.split('/')[1] ?? '')?.[0];
+      const ext = subtype === 'jpeg' ? 'jpg' : (subtype ?? 'bin');
+      // The directory is requested here, on the first data URI, not up front.
+      const dir = await getWorkDir();
+      const filePath = join(dir, `img-${counter++}.${ext}`);
+      await writeFile(filePath, Buffer.from(base64Data, 'base64'));
+      blocks.push({ type: 'image', media_type: img.media_type, path: filePath });
+    }
+
+    // Copy the message with its rewritten content; other fields are preserved.
+    result.push({ ...msg, content: blocks });
+  }
+
+  return result;
+}
+
 export interface CodeEvaluatorOptions {
   readonly command: readonly string[];
   /** @deprecated Use `command` instead */
@@ -46,8 +124,23 @@ export class CodeEvaluator implements Evaluator {
   }
 
   async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    // Lazy temp dir for materialized image files
+    let imageTmpDir: string | undefined;
+    const getImageDir = async () => {
+      if (!imageTmpDir) {
+        imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-'));
+      }
+      return imageTmpDir;
+    };
+
+    // Materialize multimodal content (data URIs → temp files, source → path)
+    const materializedOutput = await materializeContentForGrader(
+      context.output as readonly Record<string, unknown>[] | undefined,
+      getImageDir,
+    );
+
     // Determine whether to use file-backed output for large payloads
-    let outputForPayload = context.output ?? null;
+    let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
     let outputPath: string | undefined;
 
     if (outputForPayload) {
@@ -63,11 +156,17 @@ export class CodeEvaluator implements Evaluator {
     // Build payload (camelCase internally, converted to snake_case for graders)
     const payload = {
       criteria: context.evalCase.criteria,
-      expectedOutput: context.evalCase.expected_output,
+      expectedOutput: await materializeContentForGrader(
+        context.evalCase.expected_output as readonly Record<string, unknown>[],
+        getImageDir,
+      ),
       output: outputForPayload,
       outputPath,
       inputFiles: context.evalCase.file_paths,
-      input: context.evalCase.input,
+      input: await materializeContentForGrader(
+        context.evalCase.input as readonly Record<string, unknown>[],
+        getImageDir,
+      ),
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,
@@ -196,6 +295,10 @@ export class CodeEvaluator implements Evaluator {
       if (outputPath) {
         await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {});
       }
+      // Clean up temp dir for materialized images
+      if (imageTmpDir) {
+        await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {});
+      }
     }
   }
 }

diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts
@@ -5,7 +5,7 @@ import type { WriteStream } from 'node:fs';
 import { mkdir } from 'node:fs/promises';
 import path from 'node:path';
 
-import type { Content } from '../content.js';
+import { extractTextContent, toContentArray } from './claude-content.js';
 import { recordClaudeLogEntry } from './claude-log-tracker.js';
 import { buildPromptDocument, normalizeInputFiles } from './preread.js';
 import type { ClaudeResolvedConfig } from './targets.js';
@@ -479,72 +479,6 @@ function summarizeEvent(event: Record<string, unknown>): string | undefined {
   }
 }
 
-/**
- * Convert Claude's content array to Content[] preserving non-text blocks.
- * Returns undefined if content is a plain string or has only text blocks
- * (no benefit over the simpler string representation).
- */
-function toContentArray(content: unknown): Content[] | undefined {
-  if (!Array.isArray(content)) return undefined;
-
-  let hasNonText = false;
-  const blocks: Content[] = [];
-
-  for (const part of content) {
-    if (!part || typeof part !== 'object') continue;
-    const p = part as Record<string, unknown>;
-
-    if (p.type === 'text' && typeof p.text === 'string') {
-      blocks.push({ type: 'text', text: p.text });
-    } else if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) {
-      const src = p.source as Record<string, unknown>;
-      const mediaType =
-        typeof p.media_type === 'string'
-          ? p.media_type
-          : typeof src.media_type === 'string'
-            ? src.media_type
-            : 'application/octet-stream';
-      const data =
-        typeof src.data === 'string'
-          ? `data:${mediaType};base64,${src.data}`
-          : typeof p.url === 'string'
-            ? (p.url as string)
-            : '';
-      blocks.push({ type: 'image', media_type: mediaType, source: data });
-      hasNonText = true;
-    } else if (p.type === 'tool_use') {
-      // tool_use blocks are handled separately as ToolCall — skip
-    } else if (p.type === 'tool_result') {
-      // tool_result blocks are not user content — skip
-    }
-  }
-
-  return hasNonText && blocks.length > 0 ? blocks : undefined;
-}
-
-/**
- * Extract text content from Claude's content array format.
- */
-function extractTextContent(content: unknown): string | undefined {
-  if (typeof content === 'string') {
-    return content;
-  }
-  if (!Array.isArray(content)) {
-    return undefined;
-  }
-  const textParts: string[] = [];
-  for (const part of content) {
-    if (!part || typeof part !== 'object') {
-      continue;
-    }
-    const p = part as Record<string, unknown>;
-    if (p.type === 'text' && typeof p.text === 'string') {
-      textParts.push(p.text);
-    }
-  }
-  return textParts.length > 0 ? textParts.join('\n') : undefined;
-}
-
 /**
  * Extract tool calls from Claude's content array format.
  */

diff --git a/packages/core/src/evaluation/providers/claude-content.ts b/packages/core/src/evaluation/providers/claude-content.ts
@@ -0,0 +1,94 @@
+/**
+ * Shared content-mapping utilities for Claude-based providers.
+ *
+ * Converts Claude's raw content array format (Anthropic API) into the AgentV
+ * Content[] union so that non-text blocks (images) flow through the pipeline
+ * without lossy flattening.
+ *
+ * Used by: claude-cli, claude-sdk, claude (legacy).
+ *
+ * ## Claude content format
+ *
+ * Claude responses use:
+ * ```json
+ * { "content": [
+ *     { "type": "text", "text": "..." },
+ *     { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": "..." } },
+ *     { "type": "tool_use", "name": "...", "input": {...}, "id": "..." }
+ * ]}
+ * ```
+ *
+ * `toContentArray` maps text and image blocks to `Content[]`.
+ * `tool_use` and `tool_result` blocks are handled separately as `ToolCall`.
+ */
+
+import type { Content } from '../content.js';
+
+/**
+ * Convert Claude's raw content array to `Content[]`, preserving non-text blocks.
+ *
+ * Image sources are resolved in order: inline base64 (`source.data`, emitted as a data URI),
+ * then a URL on the source object (`source.url`), then a top-level `url`; otherwise `''`.
+ *
+ * `tool_use` and `tool_result` blocks are skipped; tool calls are extracted separately.
+ *
+ * @returns `undefined` for non-array input or when no image block was mapped, so callers can
+ * fall back to the text-only string representation.
+ */
+export function toContentArray(content: unknown): Content[] | undefined {
+  if (!Array.isArray(content)) return undefined;
+
+  let hasNonText = false;
+  const blocks: Content[] = [];
+
+  for (const part of content) {
+    if (!part || typeof part !== 'object') continue;
+    const p = part as Record<string, unknown>;
+
+    if (p.type === 'text' && typeof p.text === 'string') {
+      blocks.push({ type: 'text', text: p.text });
+    } else if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) {
+      const src = p.source as Record<string, unknown>;
+      const mediaType =
+        typeof p.media_type === 'string'
+          ? p.media_type
+          : typeof src.media_type === 'string'
+            ? src.media_type
+            : 'application/octet-stream';
+      // URL-sourced images carry the address on the source object, so check it before `p.url`.
+      const url =
+        typeof src.url === 'string' ? src.url : typeof p.url === 'string' ? p.url : '';
+      const data = typeof src.data === 'string' ? `data:${mediaType};base64,${src.data}` : url;
+      blocks.push({ type: 'image', media_type: mediaType, source: data });
+      // Only an image block makes the structured form worth returning.
+      hasNonText = true;
+    }
+    // Any other block type is ignored here.
+  }
+
+  return hasNonText && blocks.length > 0 ? blocks : undefined;
+}
+
+/**
+ * Collect the text of a Claude content value.
+ *
+ * @param content Either a plain string or an array of raw content blocks; anything else
+ * yields `undefined`.
+ * @returns The string itself, or the `text` of every `type: 'text'` block joined with `\n`;
+ * `undefined` when the array holds no such block.
+ */
+export function extractTextContent(content: unknown): string | undefined {
+  if (typeof content === 'string') return content;
+  if (!Array.isArray(content)) return undefined;
+
+  // Non-object entries and non-text blocks contribute nothing.
+  const pieces = content.flatMap((entry: unknown): string[] => {
+    if (!entry || typeof entry !== 'object') return [];
+    const candidate = entry as Record<string, unknown>;
+    return candidate.type === 'text' && typeof candidate.text === 'string'
+      ? [candidate.text]
+      : [];
+  });
+
+  return pieces.length === 0 ? undefined : pieces.join('\n');
+}
diff --git a/packages/core/src/evaluation/providers/claude-sdk.ts b/packages/core/src/evaluation/providers/claude-sdk.ts
@@ -4,6 +4,7 @@ import type { WriteStream } from 'node:fs';
 import { mkdir } from 'node:fs/promises';
 import path from 'node:path';
 
+import { extractTextContent, toContentArray } from './claude-content.js';
 import { recordClaudeLogEntry } from './claude-log-tracker.js';
 import { buildPromptDocument, normalizeInputFiles } from './preread.js';
 import type { ClaudeResolvedConfig } from './targets.js';
@@ -139,12 +140,13 @@ export class ClaudeSdkProvider implements Provider {
             if (betaMessage && typeof betaMessage === 'object') {
               const msg = betaMessage as Record<string, unknown>;
               const content = msg.content;
+              const structuredContent = toContentArray(content);
               const textContent = extractTextContent(content);
               const toolCalls = extractToolCalls(content);
 
               const outputMsg: Message = {
                 role: 'assistant',
-                content: textContent,
+                content: structuredContent ?? textContent,
                 toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
               };
               output.push(outputMsg);
@@ -280,30 +282,6 @@ export class ClaudeSdkProvider implements Provider {
   }
 }
 
-/**
- * Extract text content from Claude's content array format.
- * Claude uses: content: [{ type: "text", text: "..." }, ...]
- */
-function extractTextContent(content: unknown): string | undefined {
-  if (typeof content === 'string') {
-    return content;
-  }
-  if (!Array.isArray(content)) {
-    return undefined;
-  }
-  const textParts: string[] = [];
-  for (const part of content) {
-    if (!part || typeof part !== 'object') {
-      continue;
-    }
-    const p = part as Record<string, unknown>;
-    if (p.type === 'text' && typeof p.text === 'string') {
-      textParts.push(p.text);
-    }
-  }
-  return textParts.length > 0 ? textParts.join('\n') : undefined;
-}
-
 /**
  * Extract tool calls from Claude's content array format.
  * Claude uses: content: [{ type: "tool_use", name: "...", input: {...}, id: "..." }, ...]