From 1d55e8ca6a5ae32947cfd58d96019d1c48167a84 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 29 Mar 2026 02:48:47 +0000
Subject: [PATCH 1/3] =?UTF-8?q?feat(eval):=20simplify=20template=20variabl?=
 =?UTF-8?q?es=20=E2=80=94=20drop=20=5Ftext=20suffix,=20align=20with=20indu?=
 =?UTF-8?q?stry=20patterns?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- {{output}}, {{input}}, {{expected_output}} now resolve to human-readable
  text instead of JSON.stringify'd message arrays
- Deprecated _text aliases ({{input_text}}, {{output_text}},
  {{expected_output_text}}) still work but emit a stderr warning
- Removed outputText, inputText, expectedOutputText from CodeGraderInput
  schema — code graders should extract text from Message.content using
  getTextContent() from @agentv/core
- Removed EnrichedCodeGraderInput type (no longer needed)
- Updated default evaluator template to use new variable names
- Updated prompt-validator to accept both new and deprecated variable names

Closes #825

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../evaluation/evaluators/code-evaluator.ts   | 109 +-----------------
 1 file changed, 3 insertions(+), 106 deletions(-)
diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts
index 98081673..c2410924 100644
--- a/packages/core/src/evaluation/evaluators/code-evaluator.ts
+++ b/packages/core/src/evaluation/evaluators/code-evaluator.ts
@@ -9,7 +9,6 @@ import {
   createTargetProxy,
 } from '../../runtime/target-proxy.js';
 import { toSnakeCaseDeep } from '../case-conversion.js';
-import { type ContentImage, isContentArray } from '../content.js';
 import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js';
 import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js';
 import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
@@ -17,83 +16,6 @@ import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
 /** Threshold in bytes above which output is written to a temp file instead of inlined. */
 const FILE_BACKED_OUTPUT_THRESHOLD = 50_000;
 
-/** Regex matching `data:<mediaType>;base64,<data>` URIs. */
-const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
-
-/**
- * Convert ContentImage blocks in message arrays for code grader consumption.
- *
- * - Data URI images (`data:image/png;base64,...`) → decoded, written to temp file, replaced with file path.
- * - Non-URI images (already a path or URL) → `source` carried through as `path`.
- * - ContentText, ContentFile blocks → passed through unchanged.
- * - Messages with plain string content → passed through unchanged.
- *
- * Returns the original array when no image blocks exist (zero-copy fast path).
- */
-export async function materializeContentForGrader(
-  messages: readonly Record<string, unknown>[] | null | undefined,
-  getWorkDir: () => Promise<string>,
-): Promise<readonly Record<string, unknown>[] | null> {
-  if (!messages || messages.length === 0) return messages ?? null;
-
-  // Fast path: skip if no image blocks exist
-  let hasAnyImage = false;
-  for (const msg of messages) {
-    if (isContentArray(msg.content)) {
-      for (const block of msg.content) {
-        if (block.type === 'image') {
-          hasAnyImage = true;
-          break;
-        }
-      }
-    }
-    if (hasAnyImage) break;
-  }
-  if (!hasAnyImage) return messages;
-
-  let counter = 0;
-  const result: Record<string, unknown>[] = [];
-
-  for (const msg of messages) {
-    if (!isContentArray(msg.content)) {
-      result.push(msg);
-      continue;
-    }
-
-    if (!msg.content.some((b) => b.type === 'image')) {
-      result.push(msg);
-      continue;
-    }
-
-    const blocks: Record<string, unknown>[] = [];
-    for (const block of msg.content) {
-      if (block.type !== 'image') {
-        blocks.push({ ...block });
-        continue;
-      }
-
-      const img = block as ContentImage;
-      const match = DATA_URI_RE.exec(img.source);
-
-      if (match) {
-        const [, mediaType, base64Data] = match;
-        const ext = mediaType.split('/')[1] === 'jpeg' ? 'jpg' : (mediaType.split('/')[1] ?? 'bin');
-        const dir = await getWorkDir();
-        const filePath = join(dir, `img-${counter++}.${ext}`);
-        await writeFile(filePath, Buffer.from(base64Data, 'base64'));
-        blocks.push({ type: 'image', media_type: img.media_type, path: filePath });
-      } else {
-        // Already a path or URL → carry through as path
-        blocks.push({ type: 'image', media_type: img.media_type, path: img.source });
-      }
-    }
-
-    result.push({ ...msg, content: blocks });
-  }
-
-  return result;
-}
-
 export interface CodeEvaluatorOptions {
   readonly command: readonly string[];
   /** @deprecated Use `command` instead */
@@ -124,23 +46,8 @@ export class CodeEvaluator implements Evaluator {
   }
 
   async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
-    // Lazy temp dir for materialized image files
-    let imageTmpDir: string | undefined;
-    const getImageDir = async () => {
-      if (!imageTmpDir) {
-        imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-'));
-      }
-      return imageTmpDir;
-    };
-
-    // Materialize multimodal content (data URIs → temp files, source → path)
-    const materializedOutput = await materializeContentForGrader(
-      context.output as readonly Record<string, unknown>[] | undefined,
-      getImageDir,
-    );
-
     // Determine whether to use file-backed output for large payloads
-    let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
+    let outputForPayload = context.output ?? null;
     let outputPath: string | undefined;
 
     if (outputForPayload) {
@@ -156,17 +63,11 @@ export class CodeEvaluator implements Evaluator {
     // Build payload (camelCase internally, converted to snake_case for graders)
     const payload = {
       criteria: context.evalCase.criteria,
-      expectedOutput: await materializeContentForGrader(
-        context.evalCase.expected_output as readonly Record<string, unknown>[],
-        getImageDir,
-      ),
+      expectedOutput: context.evalCase.expected_output,
       output: outputForPayload,
       outputPath,
       inputFiles: context.evalCase.file_paths,
-      input: await materializeContentForGrader(
-        context.evalCase.input as readonly Record<string, unknown>[],
-        getImageDir,
-      ),
+      input: context.evalCase.input,
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,
@@ -295,10 +196,6 @@ export class CodeEvaluator implements Evaluator {
       if (outputPath) {
         await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {});
       }
-      // Clean up temp dir for materialized images
-      if (imageTmpDir) {
-        await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {});
-      }
     }
   }
 }

From e8d4488fb81554d0b17256eb8a20cec78709e318 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 29 Mar 2026 03:06:34 +0000
Subject: [PATCH 2/3] =?UTF-8?q?feat(eval):=20LLM=20grader=20multimodal=20?=
 =?UTF-8?q?=E2=80=94=20auto-append=20images=20to=20judge=20message?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add multimodal support to the LLM grader evaluator. When agent output
contains ContentImage blocks in assistant messages, they are automatically
extracted and appended as image content parts to the judge model message.

Changes:
- extractImageBlocks(): scans assistant messages for ContentImage blocks
- toAiSdkImageParts(): converts ContentImage to Vercel AI SDK ImagePart
- runWithRetry(): accepts optional images; uses multi-part messages array
  when images are present, plain text prompt when not (backward compatible)
- evaluateFreeform/evaluateWithRubrics/evaluateWithScoreRanges: extract
  images from context.output and pass to runWithRetry

Follows Inspect AI's model_scoring_prompt() pattern: no template syntax
changes needed — images are transparently appended after the rendered text.

Closes #820

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../core/src/evaluation/evaluators/index.ts   |   1 +
 .../src/evaluation/evaluators/llm-grader.ts   |  92 ++++-
 .../evaluation/llm-grader-multimodal.test.ts  | 356 ++++++++++++++++++
 3 files changed, 442 insertions(+), 7 deletions(-)
 create mode 100644 packages/core/test/evaluation/llm-grader-multimodal.test.ts

diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts
index 678999bd..62f6041c 100644
--- a/packages/core/src/evaluation/evaluators/index.ts
+++ b/packages/core/src/evaluation/evaluators/index.ts
@@ -47,6 +47,7 @@ export {
   buildScoreRangeOutputSchema,
   calculateRubricScore,
   DEFAULT_EVALUATOR_TEMPLATE,
+  extractImageBlocks,
   substituteVariables,
   freeformEvaluationSchema,
   rubricEvaluationSchema,
diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts
index 553f0677..05ae50ad 100644
--- a/packages/core/src/evaluation/evaluators/llm-grader.ts
+++ b/packages/core/src/evaluation/evaluators/llm-grader.ts
@@ -4,7 +4,9 @@ import path from 'node:path';
 import { generateText, stepCountIs, tool } from 'ai';
 import { z } from 'zod';
 
-import type { Provider, ProviderResponse } from '../providers/types.js';
+import type { ContentImage } from '../content.js';
+import { isContentArray } from '../content.js';
+import type { Message, Provider, ProviderResponse } from '../providers/types.js';
 import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js';
 import { DEPRECATED_TEMPLATE_VARIABLES, TEMPLATE_VARIABLES } from '../template-variables.js';
 import type { TokenUsage } from '../trace.js';
@@ -242,6 +244,9 @@ export class LlmGraderEvaluator implements Evaluator {
       systemPrompt,
     };
 
+    // Extract image blocks from agent output for multimodal grading
+    const images = context.output ? extractImageBlocks(context.output) : [];
+
     try {
       const { data, tokenUsage } = await this.runWithRetry({
         context,
@@ -249,6 +254,7 @@ export class LlmGraderEvaluator implements Evaluator {
         systemPrompt,
         userPrompt,
         schema: freeformEvaluationSchema,
+        images,
       });
 
       const score = clampScore(data.score);
@@ -309,6 +315,9 @@ export class LlmGraderEvaluator implements Evaluator {
       systemPrompt,
     };
 
+    // Extract image blocks from agent output for multimodal grading
+    const images = context.output ? extractImageBlocks(context.output) : [];
+
     try {
       const { data, tokenUsage } = await this.runWithRetry({
         context,
@@ -316,6 +325,7 @@ export class LlmGraderEvaluator implements Evaluator {
         systemPrompt,
         userPrompt: prompt,
         schema: rubricEvaluationSchema,
+        images,
       });
 
       const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
@@ -361,6 +371,9 @@ export class LlmGraderEvaluator implements Evaluator {
       systemPrompt,
     };
 
+    // Extract image blocks from agent output for multimodal grading
+    const images = context.output ? extractImageBlocks(context.output) : [];
+
     try {
       const { data, tokenUsage } = await this.runWithRetry({
         context,
@@ -368,6 +381,7 @@ export class LlmGraderEvaluator implements Evaluator {
         systemPrompt,
         userPrompt: prompt,
         schema: scoreRangeEvaluationSchema,
+        images,
       });
 
       const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
@@ -936,8 +950,9 @@ export class LlmGraderEvaluator implements Evaluator {
     readonly systemPrompt: string;
     readonly userPrompt: string;
     readonly schema: z.ZodSchema<T>;
+    readonly images?: readonly ContentImage[];
   }): Promise<{ data: T; providerResponse?: ProviderResponse; tokenUsage?: TokenUsage }> {
-    const { context, graderProvider, systemPrompt, userPrompt, schema } = options;
+    const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
 
     let lastError: Error | undefined;
 
@@ -946,13 +961,34 @@ export class LlmGraderEvaluator implements Evaluator {
         // Prefer Vercel AI SDK language model if available.
         const model = graderProvider.asLanguageModel?.();
         if (model) {
-          const result = await generateText({
-            model,
-            system: systemPrompt,
-            prompt: userPrompt,
+          const modelOptions = {
             ...(this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {}),
             ...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}),
-          });
+          };
+
+          // When images are present, use multi-part messages instead of plain prompt
+          const hasImages = images && images.length > 0;
+          const result = hasImages
+            ? await generateText({
+                model,
+                system: systemPrompt,
+                messages: [
+                  {
+                    role: 'user' as const,
+                    content: [
+                      { type: 'text' as const, text: userPrompt },
+                      ...toAiSdkImageParts(images),
+                    ],
+                  },
+                ],
+                ...modelOptions,
+              })
+            : await generateText({
+                model,
+                system: systemPrompt,
+                prompt: userPrompt,
+                ...modelOptions,
+              });
 
           const data = schema.parse(parseJsonFromText(result.text));
           const rawUsage = result.usage;
@@ -1200,6 +1236,48 @@ function calculateScoreRangeResult(
   };
 }
 
+// ---------------------------------------------------------------------------
+// Multimodal helpers — extract image blocks from agent output messages
+// ---------------------------------------------------------------------------
+
+/**
+ * Collect every `ContentImage` block found in assistant messages.
+ *
+ * @param messages - Messages to scan. Entries whose role is not `'assistant'`,
+ *   or whose `content` does not pass `isContentArray`, are skipped.
+ * @returns The matching image blocks in encounter order (same objects, not copies).
+ */
+export function extractImageBlocks(messages: readonly Message[]): ContentImage[] {
+  const images: ContentImage[] = [];
+  for (const msg of messages) {
+    if (msg.role !== 'assistant') continue;
+    if (!isContentArray(msg.content)) continue;
+    for (const block of msg.content) {
+      if (block.type === 'image') {
+        images.push(block);
+      }
+    }
+  }
+  return images;
+}
+
+/**
+ * Convert AgentV `ContentImage` blocks to Vercel AI SDK image content parts.
+ *
+ * Each block becomes `{ type: 'image', image, mediaType }`: `source` is copied
+ * verbatim into `image`, and a falsy `media_type` is normalized to `undefined`.
+ * NOTE(review): assumes the SDK accepts every `source` form unchanged — confirm for local paths.
+ */
+function toAiSdkImageParts(
+  images: readonly ContentImage[],
+): Array<{ type: 'image'; image: string; mediaType?: string }> {
+  return images.map((img) => ({
+    type: 'image' as const,
+    image: img.source,
+    mediaType: img.media_type || undefined,
+  }));
+}
+
 // ---------------------------------------------------------------------------
 // Sandboxed filesystem tools for built-in agent mode
 // ---------------------------------------------------------------------------
diff --git a/packages/core/test/evaluation/llm-grader-multimodal.test.ts b/packages/core/test/evaluation/llm-grader-multimodal.test.ts
new file mode 100644
index 00000000..1db1c417
--- /dev/null
+++ b/packages/core/test/evaluation/llm-grader-multimodal.test.ts
@@ -0,0 +1,356 @@
+/**
+ * Tests for LLM grader multimodal support — auto-appending image content blocks
+ * from agent output to the judge message.
+ *
+ * Verifies:
+ * - Images from assistant messages are extracted and sent to the judge
+ * - Text-only output is unchanged (backward compatible)
+ * - Multiple images are all appended
+ * - Images in non-assistant messages are ignored
+ */
+
+import { describe, expect, it, mock, beforeEach } from 'bun:test';
+
+import type { Message } from '../../src/evaluation/providers/types.js';
+import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js';
+import type { EvalTest } from '../../src/evaluation/types.js';
+
+// ---------------------------------------------------------------------------
+// Mock generateText to capture what the LLM grader sends to the judge.
+// Must be set up before importing the module under test.
+// ---------------------------------------------------------------------------
+
+let capturedGenerateTextArgs: Record<string, unknown> | undefined;
+
+function graderJsonResponse(score: number): string {
+  return JSON.stringify({
+    score,
+    assertions: [{ text: 'Checked output', passed: score >= 0.5 }],
+  });
+}
+
+mock.module('ai', () => {
+  const actual = require('ai');
+  return {
+    ...actual,
+    generateText: mock(async (args: Record<string, unknown>) => {
+      capturedGenerateTextArgs = args;
+      return {
+        text: graderJsonResponse(0.85),
+        usage: { inputTokens: 10, outputTokens: 20 },
+        finishReason: 'stop',
+        response: { id: 'test', timestamp: new Date(), modelId: 'test' },
+      };
+    }),
+  };
+});
+
+// Import AFTER mock is set up
+const { extractImageBlocks } = await import('../../src/evaluation/evaluators/llm-grader.js');
+const { LlmGraderEvaluator } = await import('../../src/evaluation/evaluators.js');
+
+// ---------------------------------------------------------------------------
+// Test helpers
+// ---------------------------------------------------------------------------
+
+const baseTestCase: EvalTest = {
+  id: 'mm-case-1',
+  dataset: 'test-dataset',
+  question: 'Describe the image',
+  input: [{ role: 'user', content: 'What is in this image?' }],
+  expected_output: [],
+  reference_answer: 'A cat sitting on a mat',
+  file_paths: [],
+  criteria: 'Accurately describes image content',
+  evaluator: 'llm-grader',
+};
+
+const baseTarget: ResolvedTarget = {
+  kind: 'mock',
+  name: 'mock',
+  config: { response: '{}' },
+};
+
+/**
+ * Builds a stub provider whose `asLanguageModel()` returns a placeholder model
+ * object, steering the grader onto its `generateText` path (mocked above).
+ */
+function createLmProvider() {
+  const fakeModel = { modelId: 'test-model', provider: 'test' };
+  return {
+    id: 'test-lm',
+    kind: 'mock' as const,
+    targetName: 'test-lm',
+    invoke: mock(async () => ({ output: [] })),
+    asLanguageModel: () => fakeModel as never,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// extractImageBlocks unit tests
+// ---------------------------------------------------------------------------
+
+describe('extractImageBlocks', () => {
+  it('returns empty array when no messages', () => {
+    expect(extractImageBlocks([])).toEqual([]);
+  });
+
+  it('returns empty array when messages have only string content', () => {
+    const messages: Message[] = [
+      { role: 'user', content: 'hello' },
+      { role: 'assistant', content: 'world' },
+    ];
+    expect(extractImageBlocks(messages)).toEqual([]);
+  });
+
+  it('extracts images from assistant messages with Content[] content', () => {
+    const messages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Here is the result' },
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc123' },
+        ],
+      },
+    ];
+    const images = extractImageBlocks(messages);
+    expect(images).toHaveLength(1);
+    expect(images[0]).toEqual({
+      type: 'image',
+      media_type: 'image/png',
+      source: 'data:image/png;base64,abc123',
+    });
+  });
+
+  it('extracts multiple images across multiple assistant messages', () => {
+    const messages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'image', media_type: 'image/png', source: 'https://example.com/img1.png' },
+        ],
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Another response' },
+          { type: 'image', media_type: 'image/jpeg', source: 'data:image/jpeg;base64,xyz789' },
+          { type: 'image', media_type: 'image/webp', source: 'https://example.com/img2.webp' },
+        ],
+      },
+    ];
+    const images = extractImageBlocks(messages);
+    expect(images).toHaveLength(3);
+    expect(images[0].source).toBe('https://example.com/img1.png');
+    expect(images[1].source).toBe('data:image/jpeg;base64,xyz789');
+    expect(images[2].source).toBe('https://example.com/img2.webp');
+  });
+
+  it('ignores images in non-assistant messages', () => {
+    const messages: Message[] = [
+      {
+        role: 'user',
+        content: [
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,user-img' },
+        ],
+      },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,asst-img' },
+        ],
+      },
+      {
+        role: 'tool',
+        content: [
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,tool-img' },
+        ],
+      },
+    ];
+    const images = extractImageBlocks(messages);
+    expect(images).toHaveLength(1);
+    expect(images[0].source).toBe('data:image/png;base64,asst-img');
+  });
+
+  it('ignores file content blocks (only extracts images)', () => {
+    const messages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Result' },
+          { type: 'file', media_type: 'application/pdf', path: '/docs/doc.pdf' },
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc' },
+        ],
+      },
+    ];
+    const images = extractImageBlocks(messages);
+    expect(images).toHaveLength(1);
+    expect(images[0].type).toBe('image');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// LLM grader multimodal integration tests
+// ---------------------------------------------------------------------------
+
+describe('LlmGraderEvaluator multimodal', () => {
+  beforeEach(() => {
+    capturedGenerateTextArgs = undefined;
+  });
+
+  it('sends plain text prompt when output has no images', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const result = await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'A cat on a mat',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe the image' },
+      now: new Date(),
+      output: [{ role: 'assistant', content: 'A cat on a mat' }],
+    });
+
+    expect(result.score).toBe(0.85);
+    expect(capturedGenerateTextArgs).toBeDefined();
+
+    // When no images, generateText should receive `prompt` (string), not `messages`
+    expect(capturedGenerateTextArgs!.prompt).toBeTypeOf('string');
+    expect(capturedGenerateTextArgs!.messages).toBeUndefined();
+  });
+
+  it('sends multi-part messages when output contains images', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const outputMessages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Here is a cat' },
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,CATIMAGE' },
+        ],
+      },
+    ];
+
+    const result = await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Here is a cat',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe the image' },
+      now: new Date(),
+      output: outputMessages,
+    });
+
+    expect(result.score).toBe(0.85);
+    expect(capturedGenerateTextArgs).toBeDefined();
+
+    // When images exist, generateText should receive `messages` with multi-part content
+    expect(capturedGenerateTextArgs!.messages).toBeDefined();
+    expect(capturedGenerateTextArgs!.prompt).toBeUndefined();
+
+    const messages = capturedGenerateTextArgs!.messages as Array<Record<string, unknown>>;
+    expect(messages).toHaveLength(1);
+    expect(messages[0].role).toBe('user');
+
+    const content = messages[0].content as Array<Record<string, unknown>>;
+
+    // Should contain text part + image part
+    const textParts = content.filter((p) => p.type === 'text');
+    const imageParts = content.filter((p) => p.type === 'image');
+
+    expect(textParts.length).toBeGreaterThanOrEqual(1);
+    expect(imageParts).toHaveLength(1);
+
+    // Verify image data is passed through
+    expect(imageParts[0].image).toBe('data:image/png;base64,CATIMAGE');
+    expect(imageParts[0].mediaType).toBe('image/png');
+  });
+
+  it('appends multiple images from output', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const outputMessages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Two images' },
+          { type: 'image', media_type: 'image/png', source: 'https://example.com/img1.png' },
+          { type: 'image', media_type: 'image/jpeg', source: 'data:image/jpeg;base64,IMG2DATA' },
+        ],
+      },
+    ];
+
+    await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Two images',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe the images' },
+      now: new Date(),
+      output: outputMessages,
+    });
+
+    expect(capturedGenerateTextArgs).toBeDefined();
+    const messages = capturedGenerateTextArgs!.messages as Array<Record<string, unknown>>;
+    const content = messages[0].content as Array<Record<string, unknown>>;
+
+    const imageParts = content.filter((p) => p.type === 'image');
+    expect(imageParts).toHaveLength(2);
+    expect(imageParts[0].image).toBe('https://example.com/img1.png');
+    expect(imageParts[1].image).toBe('data:image/jpeg;base64,IMG2DATA');
+  });
+
+  it('ignores images in user/tool messages (only assistant)', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const outputMessages: Message[] = [
+      {
+        role: 'user',
+        content: [
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,USERIMG' },
+        ],
+      },
+      {
+        role: 'assistant',
+        content: 'Just text, no images',
+      },
+    ];
+
+    await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Just text, no images',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe' },
+      now: new Date(),
+      output: outputMessages,
+    });
+
+    expect(capturedGenerateTextArgs).toBeDefined();
+
+    // No images in assistant messages → should use plain prompt
+    expect(capturedGenerateTextArgs!.prompt).toBeTypeOf('string');
+    expect(capturedGenerateTextArgs!.messages).toBeUndefined();
+  });
+});

From edb41782399c703d0ae72894702b06eec1f9d40f Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 29 Mar 2026 03:07:11 +0000
Subject: [PATCH 3/3] =?UTF-8?q?feat(eval):=20code=20grader=20multimodal=20?=
 =?UTF-8?q?=E2=80=94=20structured=20Content=20in=20CodeGraderInput?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add ContentTextSchema, ContentImageSchema, ContentFileSchema, ContentSchema
  as Zod discriminated union in packages/eval/src/schemas.ts
- Update MessageSchema.content to accept string | Content[] (typed blocks)
- Add materializeContentForGrader() in code-evaluator.ts:
  - Data URI images decoded and written to temp files (path, not base64)
  - Non-data-URI images (file paths or URLs) pass source through as path field
  - Text/file blocks unchanged; string content unchanged
- Lazy temp dir creation for image files, cleaned up in finally block
- Export Content schemas and types from @agentv/eval
- Add comprehensive unit tests for schema validation and materialization
- Add integration tests for CodeEvaluator with multimodal output

Closes #821

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../evaluation/evaluators/code-evaluator.ts   | 109 +++++++++++++++++-
 1 file changed, 106 insertions(+), 3 deletions(-)

diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts
index c2410924..98081673 100644
--- a/packages/core/src/evaluation/evaluators/code-evaluator.ts
+++ b/packages/core/src/evaluation/evaluators/code-evaluator.ts
@@ -9,6 +9,7 @@ import {
   createTargetProxy,
 } from '../../runtime/target-proxy.js';
 import { toSnakeCaseDeep } from '../case-conversion.js';
+import { type ContentImage, isContentArray } from '../content.js';
 import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js';
 import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js';
 import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
@@ -16,6 +17,83 @@ import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
 /** Threshold in bytes above which output is written to a temp file instead of inlined. */
 const FILE_BACKED_OUTPUT_THRESHOLD = 50_000;
 
+/** Matches `data:<mediaType>;base64,<data>`; captures media type (1) and payload (2). */
+const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
+
+/**
+ * Convert ContentImage blocks in message arrays for code grader consumption.
+ *
+ * - Base64 data-URI images (`data:image/png;base64,...`) → decoded, written to a file in the
+ *   directory returned by `getWorkDir`, and replaced with `{ type, media_type, path }`.
+ * - Any other image `source` (file path or URL) → carried through unchanged as `path`.
+ * - Non-image blocks are shallow-copied; messages without image blocks are reused as-is.
+ *
+ * Returns `null` for null/undefined input and the original array when it has no image blocks.
+ */
+export async function materializeContentForGrader(
+  messages: readonly Record<string, unknown>[] | null | undefined,
+  getWorkDir: () => Promise<string>,
+): Promise<readonly Record<string, unknown>[] | null> {
+  if (!messages || messages.length === 0) return messages ?? null;
+
+  // Fast path: return the input untouched when no message carries an image block
+  let hasAnyImage = false;
+  for (const msg of messages) {
+    if (isContentArray(msg.content)) {
+      for (const block of msg.content) {
+        if (block.type === 'image') {
+          hasAnyImage = true;
+          break;
+        }
+      }
+    }
+    if (hasAnyImage) break;
+  }
+  if (!hasAnyImage) return messages;
+
+  let counter = 0;
+  const result: Record<string, unknown>[] = [];
+
+  for (const msg of messages) {
+    if (!isContentArray(msg.content)) {
+      result.push(msg);
+      continue;
+    }
+
+    if (!msg.content.some((b) => b.type === 'image')) {
+      result.push(msg);
+      continue;
+    }
+
+    const blocks: Record<string, unknown>[] = [];
+    for (const block of msg.content) {
+      if (block.type !== 'image') {
+        blocks.push({ ...block });
+        continue;
+      }
+
+      const img = block as ContentImage;
+      const match = DATA_URI_RE.exec(img.source);
+
+      if (match) {
+        const [, mediaType, base64Data] = match;
+        const ext = mediaType.split('/')[1] === 'jpeg' ? 'jpg' : (mediaType.split('/')[1] ?? 'bin');
+        const dir = await getWorkDir();
+        const filePath = join(dir, `img-${counter++}.${ext}`);
+        await writeFile(filePath, Buffer.from(base64Data, 'base64'));
+        blocks.push({ type: 'image', media_type: img.media_type, path: filePath });
+      } else {
+        // Not a base64 data URI (path or URL) → carry the source through as path
+        blocks.push({ type: 'image', media_type: img.media_type, path: img.source });
+      }
+    }
+
+    result.push({ ...msg, content: blocks });
+  }
+
+  return result;
+}
+
 export interface CodeEvaluatorOptions {
   readonly command: readonly string[];
   /** @deprecated Use `command` instead */
@@ -46,8 +124,23 @@ export class CodeEvaluator implements Evaluator {
   }
 
   async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    // Lazy temp dir for materialized image files
+    let imageTmpDir: string | undefined;
+    const getImageDir = async () => {
+      if (!imageTmpDir) {
+        imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-'));
+      }
+      return imageTmpDir;
+    };
+
+    // Materialize multimodal content (data URIs → temp files, source → path)
+    const materializedOutput = await materializeContentForGrader(
+      context.output as readonly Record<string, unknown>[] | undefined,
+      getImageDir,
+    );
+
     // Determine whether to use file-backed output for large payloads
-    let outputForPayload = context.output ?? null;
+    let outputForPayload: readonly Record<string, unknown>[] | null = materializedOutput;
     let outputPath: string | undefined;
 
     if (outputForPayload) {
@@ -63,11 +156,17 @@ export class CodeEvaluator implements Evaluator {
     // Build payload (camelCase internally, converted to snake_case for graders)
     const payload = {
       criteria: context.evalCase.criteria,
-      expectedOutput: context.evalCase.expected_output,
+      expectedOutput: await materializeContentForGrader(
+        context.evalCase.expected_output as readonly Record<string, unknown>[],
+        getImageDir,
+      ),
       output: outputForPayload,
       outputPath,
       inputFiles: context.evalCase.file_paths,
-      input: context.evalCase.input,
+      input: await materializeContentForGrader(
+        context.evalCase.input as readonly Record<string, unknown>[],
+        getImageDir,
+      ),
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,
@@ -196,6 +295,10 @@ export class CodeEvaluator implements Evaluator {
       if (outputPath) {
         await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {});
       }
+      // Clean up temp dir for materialized images
+      if (imageTmpDir) {
+        await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {});
+      }
     }
   }
 }