From a3c1a022937d7d7c243c36f1308cb042306c9274 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Sun, 29 Mar 2026 02:34:08 +0000
Subject: [PATCH 1/2] feat(core): preserve multimodal content blocks in provider responses

Update Claude and Pi providers to preserve non-text content blocks
(images) in Message.content instead of discarding them via
extractTextContent(). This enables multimodal content to flow from
provider response through to evaluators.

Changes:
- Create shared claude-content.ts with toContentArray() and
  extractTextContent() used by all 3 Claude providers
- Update claude-cli, claude-sdk, claude providers to use
  structuredContent ?? textContent pattern
- Add toPiContentArray() to pi-utils.ts for Pi provider
- Update pi-coding-agent convertAgentMessage() to preserve structured
  content
- Add 23 unit tests covering content preservation, backward compat, and
  end-to-end multimodal flow

Text-only responses still produce plain strings (no unnecessary
wrapping). extractTextContent() remains available for backward
compatibility.

Closes #818 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/evaluation/providers/claude-cli.ts | 68 +---- .../evaluation/providers/claude-content.ts | 94 ++++++ .../src/evaluation/providers/claude-sdk.ts | 28 +- .../core/src/evaluation/providers/claude.ts | 28 +- .../evaluation/providers/pi-coding-agent.ts | 5 +- .../core/src/evaluation/providers/pi-utils.ts | 50 +++ .../providers/content-preserve.test.ts | 288 ++++++++++++++++++ 7 files changed, 442 insertions(+), 119 deletions(-) create mode 100644 packages/core/src/evaluation/providers/claude-content.ts create mode 100644 packages/core/test/evaluation/providers/content-preserve.test.ts diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index d400c2069..1699810dd 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -5,7 +5,7 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; -import type { Content } from '../content.js'; +import { extractTextContent, toContentArray } from './claude-content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { ClaudeResolvedConfig } from './targets.js'; @@ -479,72 +479,6 @@ function summarizeEvent(event: Record): string | undefined { } } -/** - * Convert Claude's content array to Content[] preserving non-text blocks. - * Returns undefined if content is a plain string or has only text blocks - * (no benefit over the simpler string representation). 
- */ -function toContentArray(content: unknown): Content[] | undefined { - if (!Array.isArray(content)) return undefined; - - let hasNonText = false; - const blocks: Content[] = []; - - for (const part of content) { - if (!part || typeof part !== 'object') continue; - const p = part as Record; - - if (p.type === 'text' && typeof p.text === 'string') { - blocks.push({ type: 'text', text: p.text }); - } else if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) { - const src = p.source as Record; - const mediaType = - typeof p.media_type === 'string' - ? p.media_type - : typeof src.media_type === 'string' - ? src.media_type - : 'application/octet-stream'; - const data = - typeof src.data === 'string' - ? `data:${mediaType};base64,${src.data}` - : typeof p.url === 'string' - ? (p.url as string) - : ''; - blocks.push({ type: 'image', media_type: mediaType, source: data }); - hasNonText = true; - } else if (p.type === 'tool_use') { - // tool_use blocks are handled separately as ToolCall — skip - } else if (p.type === 'tool_result') { - // tool_result blocks are not user content — skip - } - } - - return hasNonText && blocks.length > 0 ? blocks : undefined; -} - -/** - * Extract text content from Claude's content array format. - */ -function extractTextContent(content: unknown): string | undefined { - if (typeof content === 'string') { - return content; - } - if (!Array.isArray(content)) { - return undefined; - } - const textParts: string[] = []; - for (const part of content) { - if (!part || typeof part !== 'object') { - continue; - } - const p = part as Record; - if (p.type === 'text' && typeof p.text === 'string') { - textParts.push(p.text); - } - } - return textParts.length > 0 ? textParts.join('\n') : undefined; -} - /** * Extract tool calls from Claude's content array format. 
*/ diff --git a/packages/core/src/evaluation/providers/claude-content.ts b/packages/core/src/evaluation/providers/claude-content.ts new file mode 100644 index 000000000..889029fc9 --- /dev/null +++ b/packages/core/src/evaluation/providers/claude-content.ts @@ -0,0 +1,94 @@ +/** + * Shared content-mapping utilities for Claude-based providers. + * + * Converts Claude's raw content array format (Anthropic API) into the AgentV + * Content[] union so that non-text blocks (images) flow through the pipeline + * without lossy flattening. + * + * Used by: claude-cli, claude-sdk, claude (legacy). + * + * ## Claude content format + * + * Claude responses use: + * ```json + * { "content": [ + * { "type": "text", "text": "..." }, + * { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": "..." } }, + * { "type": "tool_use", "name": "...", "input": {...}, "id": "..." } + * ]} + * ``` + * + * `toContentArray` maps text and image blocks to `Content[]`. + * `tool_use` and `tool_result` blocks are handled separately as `ToolCall`. + */ + +import type { Content } from '../content.js'; + +/** + * Convert Claude's raw content array to `Content[]`, preserving non-text blocks. + * + * Returns `undefined` when the content is a plain string or contains only text + * blocks — callers should fall back to the text-only string representation in + * that case (no benefit from wrapping plain text in `Content[]`). 
+ */ +export function toContentArray(content: unknown): Content[] | undefined { + if (!Array.isArray(content)) return undefined; + + let hasNonText = false; + const blocks: Content[] = []; + + for (const part of content) { + if (!part || typeof part !== 'object') continue; + const p = part as Record; + + if (p.type === 'text' && typeof p.text === 'string') { + blocks.push({ type: 'text', text: p.text }); + } else if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) { + const src = p.source as Record; + const mediaType = + typeof p.media_type === 'string' + ? p.media_type + : typeof src.media_type === 'string' + ? src.media_type + : 'application/octet-stream'; + const data = + typeof src.data === 'string' + ? `data:${mediaType};base64,${src.data}` + : typeof p.url === 'string' + ? (p.url as string) + : ''; + blocks.push({ type: 'image', media_type: mediaType, source: data }); + hasNonText = true; + } else if (p.type === 'tool_use') { + // tool_use blocks are handled separately as ToolCall — skip + } else if (p.type === 'tool_result') { + // tool_result blocks are not user content — skip + } + } + + return hasNonText && blocks.length > 0 ? blocks : undefined; +} + +/** + * Extract text content from Claude's content array format. + * Returns joined text from all `type: 'text'` blocks (newline-separated). + */ +export function extractTextContent(content: unknown): string | undefined { + if (typeof content === 'string') { + return content; + } + if (!Array.isArray(content)) { + return undefined; + } + const textParts: string[] = []; + for (const part of content) { + if (!part || typeof part !== 'object') { + continue; + } + const p = part as Record; + if (p.type === 'text' && typeof p.text === 'string') { + textParts.push(p.text); + } + } + return textParts.length > 0 ? 
textParts.join('\n') : undefined; +} diff --git a/packages/core/src/evaluation/providers/claude-sdk.ts b/packages/core/src/evaluation/providers/claude-sdk.ts index aab8cc16b..6e8985fa4 100644 --- a/packages/core/src/evaluation/providers/claude-sdk.ts +++ b/packages/core/src/evaluation/providers/claude-sdk.ts @@ -4,6 +4,7 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; +import { extractTextContent, toContentArray } from './claude-content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { ClaudeResolvedConfig } from './targets.js'; @@ -139,12 +140,13 @@ export class ClaudeSdkProvider implements Provider { if (betaMessage && typeof betaMessage === 'object') { const msg = betaMessage as Record; const content = msg.content; + const structuredContent = toContentArray(content); const textContent = extractTextContent(content); const toolCalls = extractToolCalls(content); const outputMsg: Message = { role: 'assistant', - content: textContent, + content: structuredContent ?? textContent, toolCalls: toolCalls.length > 0 ? toolCalls : undefined, }; output.push(outputMsg); @@ -280,30 +282,6 @@ export class ClaudeSdkProvider implements Provider { } } -/** - * Extract text content from Claude's content array format. - * Claude uses: content: [{ type: "text", text: "..." }, ...] - */ -function extractTextContent(content: unknown): string | undefined { - if (typeof content === 'string') { - return content; - } - if (!Array.isArray(content)) { - return undefined; - } - const textParts: string[] = []; - for (const part of content) { - if (!part || typeof part !== 'object') { - continue; - } - const p = part as Record; - if (p.type === 'text' && typeof p.text === 'string') { - textParts.push(p.text); - } - } - return textParts.length > 0 ? 
textParts.join('\n') : undefined; -} - /** * Extract tool calls from Claude's content array format. * Claude uses: content: [{ type: "tool_use", name: "...", input: {...}, id: "..." }, ...] diff --git a/packages/core/src/evaluation/providers/claude.ts b/packages/core/src/evaluation/providers/claude.ts index 62382a604..2ac222e4f 100644 --- a/packages/core/src/evaluation/providers/claude.ts +++ b/packages/core/src/evaluation/providers/claude.ts @@ -4,6 +4,7 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; +import { extractTextContent, toContentArray } from './claude-content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { ClaudeResolvedConfig } from './targets.js'; @@ -139,12 +140,13 @@ export class ClaudeProvider implements Provider { if (betaMessage && typeof betaMessage === 'object') { const msg = betaMessage as Record; const content = msg.content; + const structuredContent = toContentArray(content); const textContent = extractTextContent(content); const toolCalls = extractToolCalls(content); const outputMsg: Message = { role: 'assistant', - content: textContent, + content: structuredContent ?? textContent, toolCalls: toolCalls.length > 0 ? toolCalls : undefined, }; output.push(outputMsg); @@ -278,30 +280,6 @@ export class ClaudeProvider implements Provider { } } -/** - * Extract text content from Claude's content array format. - * Claude uses: content: [{ type: "text", text: "..." }, ...] 
- */ -function extractTextContent(content: unknown): string | undefined { - if (typeof content === 'string') { - return content; - } - if (!Array.isArray(content)) { - return undefined; - } - const textParts: string[] = []; - for (const part of content) { - if (!part || typeof part !== 'object') { - continue; - } - const p = part as Record; - if (p.type === 'text' && typeof p.text === 'string') { - textParts.push(p.text); - } - } - return textParts.length > 0 ? textParts.join('\n') : undefined; -} - /** * Extract tool calls from Claude's content array format. * Claude uses: content: [{ type: "tool_use", name: "...", input: {...}, id: "..." }, ...] diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index 1c92f92cd..3e4691bd0 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -18,7 +18,7 @@ import { createInterface } from 'node:readline'; import { fileURLToPath } from 'node:url'; import { recordPiLogEntry } from './pi-log-tracker.js'; -import { extractPiTextContent, toFiniteNumber } from './pi-utils.js'; +import { extractPiTextContent, toFiniteNumber, toPiContentArray } from './pi-utils.js'; import { normalizeInputFiles } from './preread.js'; import type { PiCodingAgentResolvedConfig } from './targets.js'; import type { @@ -564,7 +564,8 @@ function convertAgentMessage( const msg = message as Record; const role = typeof msg.role === 'string' ? msg.role : 'unknown'; - const content = extractPiTextContent(msg.content); + const structuredContent = toPiContentArray(msg.content); + const content = structuredContent ?? 
extractPiTextContent(msg.content); const toolCalls = extractToolCalls(msg.content, toolTrackers, completedToolResults); const startTimeVal = typeof msg.timestamp === 'number' diff --git a/packages/core/src/evaluation/providers/pi-utils.ts b/packages/core/src/evaluation/providers/pi-utils.ts index 058720870..3ea78d3d8 100644 --- a/packages/core/src/evaluation/providers/pi-utils.ts +++ b/packages/core/src/evaluation/providers/pi-utils.ts @@ -5,6 +5,8 @@ * and safe numeric conversions. */ +import type { Content } from '../content.js'; + /** * Extract text content from Pi's content array format. * Pi uses: content: [{ type: "text", text: "..." }, ...] @@ -32,6 +34,54 @@ export function extractPiTextContent(content: unknown): string | undefined { return textParts.length > 0 ? textParts.join('\n') : undefined; } +/** + * Convert Pi's content array to `Content[]`, preserving non-text blocks. + * + * Returns `undefined` when content is a plain string or contains only text + * blocks — callers should fall back to the text-only string representation. + */ +export function toPiContentArray(content: unknown): Content[] | undefined { + if (!Array.isArray(content)) return undefined; + + let hasNonText = false; + const blocks: Content[] = []; + + for (const part of content) { + if (!part || typeof part !== 'object') continue; + const p = part as Record; + + if (p.type === 'text' && typeof p.text === 'string') { + blocks.push({ type: 'text', text: p.text }); + } else if (p.type === 'image') { + const mediaType = + typeof p.media_type === 'string' ? p.media_type : 'application/octet-stream'; + + let source = ''; + if (typeof p.source === 'object' && p.source !== null) { + const src = p.source as Record; + const srcMediaType = + typeof src.media_type === 'string' ? src.media_type : mediaType; + source = + typeof src.data === 'string' + ? 
`data:${srcMediaType};base64,${src.data}` + : ''; + } + if (!source && typeof p.url === 'string') { + source = p.url; + } + + if (source) { + blocks.push({ type: 'image', media_type: mediaType, source }); + hasNonText = true; + } + } else if (p.type === 'tool_use' || p.type === 'tool_result') { + // Handled separately — skip + } + } + + return hasNonText && blocks.length > 0 ? blocks : undefined; +} + /** * Safely convert an unknown value to a finite number, or undefined. */ diff --git a/packages/core/test/evaluation/providers/content-preserve.test.ts b/packages/core/test/evaluation/providers/content-preserve.test.ts new file mode 100644 index 000000000..626ed0e35 --- /dev/null +++ b/packages/core/test/evaluation/providers/content-preserve.test.ts @@ -0,0 +1,288 @@ +import { describe, expect, it } from 'vitest'; + +import { getTextContent } from '../../../src/evaluation/content.js'; +import { + extractTextContent, + toContentArray, +} from '../../../src/evaluation/providers/claude-content.js'; +import { + extractPiTextContent, + toPiContentArray, +} from '../../../src/evaluation/providers/pi-utils.js'; +import type { Content } from '../../../src/evaluation/content.js'; +import type { Message } from '../../../src/evaluation/providers/types.js'; + +// --------------------------------------------------------------------------- +// toContentArray (Claude) +// --------------------------------------------------------------------------- +describe('toContentArray', () => { + it('returns undefined for non-array input', () => { + expect(toContentArray('plain string')).toBeUndefined(); + expect(toContentArray(42)).toBeUndefined(); + expect(toContentArray(null)).toBeUndefined(); + expect(toContentArray(undefined)).toBeUndefined(); + }); + + it('returns undefined when content has only text blocks', () => { + const content = [ + { type: 'text', text: 'hello' }, + { type: 'text', text: 'world' }, + ]; + expect(toContentArray(content)).toBeUndefined(); + }); + + it('preserves 
image + text with base64 data', () => { + const content = [ + { type: 'text', text: 'Here is an image:' }, + { + type: 'image', + source: { type: 'base64', media_type: 'image/png', data: 'abc123' }, + }, + ]; + const result = toContentArray(content); + expect(result).toBeDefined(); + expect(result).toHaveLength(2); + expect(result![0]).toEqual({ type: 'text', text: 'Here is an image:' }); + expect(result![1]).toEqual({ + type: 'image', + media_type: 'image/png', + source: 'data:image/png;base64,abc123', + }); + }); + + it('handles url images', () => { + const content = [ + { + type: 'image', + url: 'https://example.com/img.png', + source: { type: 'url' }, + media_type: 'image/png', + }, + ]; + const result = toContentArray(content); + expect(result).toBeDefined(); + expect(result![0]).toEqual({ + type: 'image', + media_type: 'image/png', + source: 'https://example.com/img.png', + }); + }); + + it('skips tool_use and tool_result blocks', () => { + const content = [ + { type: 'text', text: 'hi' }, + { type: 'tool_use', name: 'bash', input: { cmd: 'ls' }, id: 't1' }, + { type: 'tool_result', tool_use_id: 't1', content: 'ok' }, + { + type: 'image', + source: { data: 'AAAA', media_type: 'image/jpeg' }, + }, + ]; + const result = toContentArray(content); + expect(result).toBeDefined(); + expect(result).toHaveLength(2); + expect(result![0]).toEqual({ type: 'text', text: 'hi' }); + expect(result![1].type).toBe('image'); + }); + + it('handles invalid parts gracefully', () => { + const content = [null, undefined, 42, 'string', { type: 'text', text: 'ok' }]; + // only text → undefined (no non-text blocks) + expect(toContentArray(content)).toBeUndefined(); + }); +}); + +// --------------------------------------------------------------------------- +// extractTextContent (Claude) +// --------------------------------------------------------------------------- +describe('extractTextContent', () => { + it('passes through a plain string', () => { + 
expect(extractTextContent('hello')).toBe('hello'); + }); + + it('returns undefined for non-array non-string', () => { + expect(extractTextContent(42)).toBeUndefined(); + expect(extractTextContent(null)).toBeUndefined(); + expect(extractTextContent(undefined)).toBeUndefined(); + expect(extractTextContent({})).toBeUndefined(); + }); + + it('extracts text from content array', () => { + const content = [ + { type: 'text', text: 'hello' }, + { type: 'text', text: 'world' }, + ]; + expect(extractTextContent(content)).toBe('hello\nworld'); + }); + + it('skips non-text blocks', () => { + const content = [ + { type: 'text', text: 'hello' }, + { type: 'image', source: { data: 'abc' } }, + { type: 'tool_use', name: 'bash' }, + ]; + expect(extractTextContent(content)).toBe('hello'); + }); + + it('returns undefined for empty array', () => { + expect(extractTextContent([])).toBeUndefined(); + }); +}); + +// --------------------------------------------------------------------------- +// toPiContentArray +// --------------------------------------------------------------------------- +describe('toPiContentArray', () => { + it('returns undefined for non-array input', () => { + expect(toPiContentArray('plain string')).toBeUndefined(); + expect(toPiContentArray(42)).toBeUndefined(); + expect(toPiContentArray(null)).toBeUndefined(); + }); + + it('returns undefined when content has only text blocks', () => { + const content = [ + { type: 'text', text: 'hello' }, + { type: 'text', text: 'world' }, + ]; + expect(toPiContentArray(content)).toBeUndefined(); + }); + + it('preserves image + text with base64 source', () => { + const content = [ + { type: 'text', text: 'Here is an image:' }, + { + type: 'image', + media_type: 'image/png', + source: { data: 'abc123', media_type: 'image/png' }, + }, + ]; + const result = toPiContentArray(content); + expect(result).toBeDefined(); + expect(result).toHaveLength(2); + expect(result![0]).toEqual({ type: 'text', text: 'Here is an image:' }); + 
expect(result![1]).toEqual({ + type: 'image', + media_type: 'image/png', + source: 'data:image/png;base64,abc123', + }); + }); + + it('handles url images', () => { + const content = [ + { + type: 'image', + url: 'https://example.com/img.png', + media_type: 'image/png', + }, + ]; + const result = toPiContentArray(content); + expect(result).toBeDefined(); + expect(result![0]).toEqual({ + type: 'image', + media_type: 'image/png', + source: 'https://example.com/img.png', + }); + }); + + it('skips tool_use and tool_result blocks', () => { + const content = [ + { type: 'text', text: 'hi' }, + { type: 'tool_use', name: 'bash' }, + { type: 'tool_result', content: 'ok' }, + { + type: 'image', + media_type: 'image/jpeg', + source: { data: 'AAAA', media_type: 'image/jpeg' }, + }, + ]; + const result = toPiContentArray(content); + expect(result).toBeDefined(); + expect(result).toHaveLength(2); + expect(result![0]).toEqual({ type: 'text', text: 'hi' }); + expect(result![1].type).toBe('image'); + }); +}); + +// --------------------------------------------------------------------------- +// extractPiTextContent (backward compat) +// --------------------------------------------------------------------------- +describe('extractPiTextContent', () => { + it('passes through a plain string', () => { + expect(extractPiTextContent('hello')).toBe('hello'); + }); + + it('extracts text from content array', () => { + const content = [ + { type: 'text', text: 'hello' }, + { type: 'text', text: 'world' }, + ]; + expect(extractPiTextContent(content)).toBe('hello\nworld'); + }); + + it('returns undefined for non-array non-string', () => { + expect(extractPiTextContent(42)).toBeUndefined(); + expect(extractPiTextContent(null)).toBeUndefined(); + }); + + it('returns undefined for empty array', () => { + expect(extractPiTextContent([])).toBeUndefined(); + }); +}); + +// --------------------------------------------------------------------------- +// End-to-end: Content[] interop +// 
--------------------------------------------------------------------------- +describe('End-to-end content preservation', () => { + it('Content[] is compatible with getTextContent', () => { + const blocks: Content[] = [ + { type: 'text', text: 'hello' }, + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc' }, + { type: 'text', text: 'world' }, + ]; + expect(getTextContent(blocks)).toBe('hello\nworld'); + }); + + it('image block survives into Message.content', () => { + const rawClaudeContent = [ + { type: 'text', text: 'Look at this:' }, + { + type: 'image', + source: { type: 'base64', media_type: 'image/png', data: 'DEADBEEF' }, + }, + ]; + + const structuredContent = toContentArray(rawClaudeContent); + const textContent = extractTextContent(rawClaudeContent); + + const msg: Message = { + role: 'assistant', + content: structuredContent ?? textContent, + }; + + // content should be Content[] (not flattened to string) + expect(Array.isArray(msg.content)).toBe(true); + const blocks = msg.content as Content[]; + expect(blocks).toHaveLength(2); + expect(blocks[0]).toEqual({ type: 'text', text: 'Look at this:' }); + expect(blocks[1].type).toBe('image'); + expect((blocks[1] as { source: string }).source).toContain('base64,DEADBEEF'); + }); + + it('text-only content falls back to string', () => { + const rawClaudeContent = [ + { type: 'text', text: 'Just text' }, + ]; + + const structuredContent = toContentArray(rawClaudeContent); + const textContent = extractTextContent(rawClaudeContent); + + const msg: Message = { + role: 'assistant', + content: structuredContent ?? 
textContent, + }; + + // text-only → toContentArray returns undefined → falls back to string + expect(typeof msg.content).toBe('string'); + expect(msg.content).toBe('Just text'); + }); +}); From 87cce5b3f73eafb9da822a30e27184395e796cad Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 03:07:11 +0000 Subject: [PATCH 2/2] =?UTF-8?q?feat(eval):=20code=20grader=20multimodal=20?= =?UTF-8?q?=E2=80=94=20structured=20Content=20in=20CodeGraderInput?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ContentTextSchema, ContentImageSchema, ContentFileSchema, ContentSchema as Zod discriminated union in packages/eval/src/schemas.ts - Update MessageSchema.content to accept string | Content[] (typed blocks) - Add materializeContentForGrader() in code-evaluator.ts: - Data URI images decoded and written to temp files (path, not base64) - Non-URI images pass source through as path field - Text/file blocks unchanged; string content unchanged - Lazy temp dir creation for image files, cleaned up in finally block - Export Content schemas and types from @agentv/eval - Add comprehensive unit tests for schema validation and materialization - Add integration tests for CodeEvaluator with multimodal output Closes #821 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../evaluation/evaluators/code-evaluator.ts | 109 +++++- .../code-evaluator-multimodal.test.ts | 338 ++++++++++++++++++ packages/eval/src/index.ts | 8 + packages/eval/src/schemas.ts | 60 +++- packages/eval/test/define-code-grader.test.ts | 162 +++++++++ 5 files changed, 673 insertions(+), 4 deletions(-) create mode 100644 packages/core/test/evaluation/code-evaluator-multimodal.test.ts diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts index c2410924b..980816731 100644 --- a/packages/core/src/evaluation/evaluators/code-evaluator.ts +++ 
b/packages/core/src/evaluation/evaluators/code-evaluator.ts @@ -9,6 +9,7 @@ import { createTargetProxy, } from '../../runtime/target-proxy.js'; import { toSnakeCaseDeep } from '../case-conversion.js'; +import { type ContentImage, isContentArray } from '../content.js'; import type { AssertionEntry, JsonObject, TargetAccessConfig } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonSafe, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; @@ -16,6 +17,83 @@ import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; /** Threshold in bytes above which output is written to a temp file instead of inlined. */ const FILE_BACKED_OUTPUT_THRESHOLD = 50_000; +/** Regex matching `data:;base64,` URIs. */ +const DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s; + +/** + * Convert ContentImage blocks in message arrays for code grader consumption. + * + * - Data URI images (`data:image/png;base64,...`) → decoded, written to temp file, replaced with file path. + * - Non-URI images (already a path or URL) → `source` carried through as `path`. + * - ContentText, ContentFile blocks → passed through unchanged. + * - Messages with plain string content → passed through unchanged. + * + * Returns the original array when no image blocks exist (zero-copy fast path). + */ +export async function materializeContentForGrader( + messages: readonly Record[] | null | undefined, + getWorkDir: () => Promise, +): Promise[] | null> { + if (!messages || messages.length === 0) return messages ?? 
null; + + // Fast path: skip if no image blocks exist + let hasAnyImage = false; + for (const msg of messages) { + if (isContentArray(msg.content)) { + for (const block of msg.content) { + if (block.type === 'image') { + hasAnyImage = true; + break; + } + } + } + if (hasAnyImage) break; + } + if (!hasAnyImage) return messages; + + let counter = 0; + const result: Record[] = []; + + for (const msg of messages) { + if (!isContentArray(msg.content)) { + result.push(msg); + continue; + } + + if (!msg.content.some((b) => b.type === 'image')) { + result.push(msg); + continue; + } + + const blocks: Record[] = []; + for (const block of msg.content) { + if (block.type !== 'image') { + blocks.push({ ...block }); + continue; + } + + const img = block as ContentImage; + const match = DATA_URI_RE.exec(img.source); + + if (match) { + const [, mediaType, base64Data] = match; + const ext = mediaType.split('/')[1] === 'jpeg' ? 'jpg' : (mediaType.split('/')[1] ?? 'bin'); + const dir = await getWorkDir(); + const filePath = join(dir, `img-${counter++}.${ext}`); + await writeFile(filePath, Buffer.from(base64Data, 'base64')); + blocks.push({ type: 'image', media_type: img.media_type, path: filePath }); + } else { + // Already a path or URL → carry through as path + blocks.push({ type: 'image', media_type: img.media_type, path: img.source }); + } + } + + result.push({ ...msg, content: blocks }); + } + + return result; +} + export interface CodeEvaluatorOptions { readonly command: readonly string[]; /** @deprecated Use `command` instead */ @@ -46,8 +124,23 @@ export class CodeEvaluator implements Evaluator { } async evaluate(context: EvaluationContext): Promise { + // Lazy temp dir for materialized image files + let imageTmpDir: string | undefined; + const getImageDir = async () => { + if (!imageTmpDir) { + imageTmpDir = await mkdtemp(join(tmpdir(), 'agentv-img-')); + } + return imageTmpDir; + }; + + // Materialize multimodal content (data URIs → temp files, source → path) + const 
materializedOutput = await materializeContentForGrader( + context.output as readonly Record[] | undefined, + getImageDir, + ); + // Determine whether to use file-backed output for large payloads - let outputForPayload = context.output ?? null; + let outputForPayload: readonly Record[] | null = materializedOutput; let outputPath: string | undefined; if (outputForPayload) { @@ -63,11 +156,17 @@ export class CodeEvaluator implements Evaluator { // Build payload (camelCase internally, converted to snake_case for graders) const payload = { criteria: context.evalCase.criteria, - expectedOutput: context.evalCase.expected_output, + expectedOutput: await materializeContentForGrader( + context.evalCase.expected_output as readonly Record[], + getImageDir, + ), output: outputForPayload, outputPath, inputFiles: context.evalCase.file_paths, - input: context.evalCase.input, + input: await materializeContentForGrader( + context.evalCase.input as readonly Record[], + getImageDir, + ), trace: context.trace ?? null, tokenUsage: context.tokenUsage ?? null, costUsd: context.costUsd ?? 
null, @@ -196,6 +295,10 @@ export class CodeEvaluator implements Evaluator { if (outputPath) { await rm(dirname(outputPath), { recursive: true, force: true }).catch(() => {}); } + // Clean up temp dir for materialized images + if (imageTmpDir) { + await rm(imageTmpDir, { recursive: true, force: true }).catch(() => {}); + } } } } diff --git a/packages/core/test/evaluation/code-evaluator-multimodal.test.ts b/packages/core/test/evaluation/code-evaluator-multimodal.test.ts new file mode 100644 index 000000000..29f14b75b --- /dev/null +++ b/packages/core/test/evaluation/code-evaluator-multimodal.test.ts @@ -0,0 +1,338 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { existsSync, readFileSync, readdirSync } from 'node:fs'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { materializeContentForGrader } from '../../src/evaluation/evaluators/code-evaluator.js'; +import { CodeEvaluator } from '../../src/evaluation/evaluators/code-evaluator.js'; +import type { EvalTest } from '../../src/evaluation/types.js'; + +const baseTestCase: EvalTest = { + id: 'case-mm', + dataset: 'test-dataset', + question: 'Test question', + input: [{ role: 'user', content: 'Describe this image' }], + expected_output: [], + reference_answer: 'A chart', + file_paths: [], + criteria: 'Describes the image correctly', + evaluator: 'code-grader', +}; + +/** Encode a string as base64 data URI. */ +function toDataUri(mediaType: string, data: string): string { + return `data:${mediaType};base64,${Buffer.from(data).toString('base64')}`; +} + +/** Create a grader script that echoes the parsed payload back as JSON. 
*/ +async function createPayloadEchoGrader(dir: string): Promise { + const script = join(dir, 'echo-grader.js'); + await writeFile( + script, + `const input = require('fs').readFileSync(0, 'utf8'); +const payload = JSON.parse(input); +console.log(JSON.stringify({ + score: 1.0, + assertions: [{ text: 'ok', passed: true }], + details: { payload }, +})); +`, + 'utf8', + ); + return [process.execPath, script]; +} + +describe('materializeContentForGrader', () => { + let tmpDir: string; + + beforeEach(async () => { + tmpDir = await mkdtemp(join(tmpdir(), 'materialize-test-')); + }); + + afterEach(async () => { + await rm(tmpDir, { recursive: true, force: true }); + }); + + const getWorkDir = () => Promise.resolve(tmpDir); + + it('returns null for null input', async () => { + const result = await materializeContentForGrader(null, getWorkDir); + expect(result).toBeNull(); + }); + + it('returns null for undefined input', async () => { + const result = await materializeContentForGrader(undefined, getWorkDir); + expect(result).toBeNull(); + }); + + it('passes through text-only messages unchanged', async () => { + const messages = [ + { role: 'user', content: 'Hello' }, + { role: 'assistant', content: 'Hi there' }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + expect(result).toBe(messages); // Same reference — zero-copy + }); + + it('passes through Content[] with only text blocks unchanged', async () => { + const messages = [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'paragraph 1' }, + { type: 'text', text: 'paragraph 2' }, + ], + }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + expect(result).toBe(messages); // Same reference — no images + }); + + it('converts ContentImage data URI to temp file path', async () => { + const imageData = 'fake-png-data-for-testing'; + const dataUri = toDataUri('image/png', imageData); + + const messages = [ + { + role: 
'assistant', + content: [ + { type: 'text', text: 'Here is a chart:' }, + { type: 'image', media_type: 'image/png', source: dataUri }, + ], + }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + expect(result).not.toBe(messages); // New array — content was transformed + + const content = (result?.[0] as Record).content as Record[]; + expect(content).toHaveLength(2); + + // Text block preserved + expect(content[0]).toEqual({ type: 'text', text: 'Here is a chart:' }); + + // Image block converted to path + const imgBlock = content[1]; + expect(imgBlock.type).toBe('image'); + expect(imgBlock.media_type).toBe('image/png'); + expect(typeof imgBlock.path).toBe('string'); + expect(imgBlock.path).toContain('img-0.png'); + expect(imgBlock).not.toHaveProperty('source'); + + // Verify file was written with correct content + const filePath = imgBlock.path as string; + expect(existsSync(filePath)).toBe(true); + const fileContent = readFileSync(filePath); + expect(fileContent.toString()).toBe(imageData); + }); + + it('converts ContentImage path/URL source to path field', async () => { + const messages = [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Chart:' }, + { type: 'image', media_type: 'image/png', source: '/workspace/chart.png' }, + ], + }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + const content = (result?.[0] as Record).content as Record[]; + const imgBlock = content[1]; + + expect(imgBlock.type).toBe('image'); + expect(imgBlock.media_type).toBe('image/png'); + expect(imgBlock.path).toBe('/workspace/chart.png'); + expect(imgBlock).not.toHaveProperty('source'); + }); + + it('handles JPEG media type extension correctly', async () => { + const dataUri = toDataUri('image/jpeg', 'fake-jpeg'); + const messages = [ + { + role: 'assistant', + content: [{ type: 'image', media_type: 'image/jpeg', source: dataUri }], + }, + ] as Record[]; + + const result = await 
materializeContentForGrader(messages, getWorkDir); + const content = (result?.[0] as Record).content as Record[]; + expect(content[0].path as string).toContain('.jpg'); + }); + + it('preserves non-content message fields', async () => { + const dataUri = toDataUri('image/png', 'data'); + const messages = [ + { + role: 'assistant', + content: [{ type: 'image', media_type: 'image/png', source: dataUri }], + toolCalls: [{ tool: 'screenshot', input: {} }], + metadata: { provider: 'test' }, + }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + const msg = result?.[0] as Record; + expect(msg.role).toBe('assistant'); + expect(msg.toolCalls).toEqual([{ tool: 'screenshot', input: {} }]); + expect(msg.metadata).toEqual({ provider: 'test' }); + }); + + it('handles multiple images across multiple messages', async () => { + const uri1 = toDataUri('image/png', 'image1'); + const uri2 = toDataUri('image/webp', 'image2'); + + const messages = [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'First chart:' }, + { type: 'image', media_type: 'image/png', source: uri1 }, + ], + }, + { + role: 'assistant', + content: [ + { type: 'text', text: 'Second chart:' }, + { type: 'image', media_type: 'image/webp', source: uri2 }, + ], + }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + expect(result).toHaveLength(2); + + const content0 = (result?.[0] as Record).content as Record[]; + const content1 = (result?.[1] as Record).content as Record[]; + + expect(content0[1].path as string).toContain('img-0.png'); + expect(content1[1].path as string).toContain('img-1.webp'); + + // Both files exist + expect(existsSync(content0[1].path as string)).toBe(true); + expect(existsSync(content1[1].path as string)).toBe(true); + }); + + it('preserves ContentFile blocks unchanged', async () => { + const dataUri = toDataUri('image/png', 'data'); + const messages = [ + { + role: 'assistant', + content: [ + { 
type: 'file', media_type: 'text/csv', path: '/workspace/data.csv' }, + { type: 'image', media_type: 'image/png', source: dataUri }, + ], + }, + ] as Record[]; + + const result = await materializeContentForGrader(messages, getWorkDir); + const content = (result?.[0] as Record).content as Record[]; + + // File block preserved exactly + expect(content[0]).toEqual({ + type: 'file', + media_type: 'text/csv', + path: '/workspace/data.csv', + }); + // Image block converted + expect(content[1].type).toBe('image'); + expect(typeof content[1].path).toBe('string'); + }); +}); + +describe('CodeEvaluator multimodal integration', () => { + let tmpDir: string; + + beforeEach(async () => { + tmpDir = await mkdtemp(join(tmpdir(), 'code-eval-mm-')); + }); + + afterEach(async () => { + await rm(tmpDir, { recursive: true, force: true }); + }); + + it('sends text-only output unchanged to grader', async () => { + const command = await createPayloadEchoGrader(tmpDir); + const output = [{ role: 'assistant' as const, content: 'Hello world' }]; + + const evaluator = new CodeEvaluator({ command }); + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'answer', + output, + }); + + expect(result.score).toBe(1.0); + const details = result.details as Record; + const payload = details.payload as Record; + const outputMsgs = payload.output as Record[]; + expect(outputMsgs[0].content).toBe('Hello world'); + }); + + it('materializes image data URIs in output for grader', async () => { + const command = await createPayloadEchoGrader(tmpDir); + const imageData = 'test-image-bytes'; + const dataUri = toDataUri('image/png', imageData); + + const output = [ + { + role: 'assistant' as const, + content: [ + { type: 'text' as const, text: 'Generated chart:' }, + { type: 'image' as const, media_type: 'image/png', source: dataUri }, + ], + }, + ]; + + const evaluator = new CodeEvaluator({ command }); + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + 
candidate: 'answer', + output, + }); + + expect(result.score).toBe(1.0); + + // Verify the grader received the payload with image paths (not data URIs) + const details = result.details as Record; + const payload = details.payload as Record; + const outputMsgs = payload.output as Record[]; + const content = outputMsgs[0].content as Record[]; + + // Text block preserved + expect(content[0]).toEqual({ type: 'text', text: 'Generated chart:' }); + + // Image block has path, not source + expect(content[1].type).toBe('image'); + expect(content[1].media_type).toBe('image/png'); + expect(typeof content[1].path).toBe('string'); + expect(content[1]).not.toHaveProperty('source'); + }); + + it('cleans up materialized image temp files after grading', async () => { + const command = await createPayloadEchoGrader(tmpDir); + const dataUri = toDataUri('image/png', 'cleanup-test'); + + const output = [ + { + role: 'assistant' as const, + content: [{ type: 'image' as const, media_type: 'image/png', source: dataUri }], + }, + ]; + + const evaluator = new CodeEvaluator({ command }); + await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'answer', + output, + }); + + // Image temp dirs should be cleaned up after evaluation + const agentVImgDirs = readdirSync(tmpdir()).filter((d) => d.startsWith('agentv-img-')); + // Can't assert zero (concurrent tests), but the cleanup logic was exercised + }); +}); diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index c814b698d..2eff5be90 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -43,6 +43,10 @@ export { ToolCallSchema, TokenUsageSchema, PromptTemplateInputSchema, + ContentTextSchema, + ContentImageSchema, + ContentFileSchema, + ContentSchema, type CodeGraderInput, type CodeGraderResult, type TraceSummary, @@ -50,6 +54,10 @@ export { type ToolCall, type TokenUsage, type PromptTemplateInput, + type ContentText, + type ContentImage, + type ContentFile, + type Content, } from 
'./schemas.js'; // Re-export target client diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 3385ac5dd..658ae1bde 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -1,6 +1,22 @@ /** * Zod schemas for code grader input/output validation. * Provides both compile-time types and runtime validation. + * + * ## Content model + * + * `Message.content` accepts `string | Content[]`: + * - `string` — backward-compatible plain text (most common case) + * - `Content[]` — typed content blocks for multimodal messages + * + * Content variants: + * - `ContentText` — `{ type: 'text', text: string }` + * - `ContentImage` — `{ type: 'image', media_type: string, path: string }` (file path, not base64) + * - `ContentFile` — `{ type: 'file', media_type: string, path: string }` + * + * To add a new content variant: + * 1. Define a new Zod schema with a unique `type` literal + * 2. Add it to `ContentSchema` discriminated union + * 3. Re-export from `index.ts` */ import { z } from 'zod'; @@ -37,12 +53,49 @@ export const ToolCallSchema = z.object({ durationMs: z.number().optional(), }); +// --------------------------------------------------------------------------- +// Content block schemas (discriminated union on `type`) +// --------------------------------------------------------------------------- + +/** Text content block. */ +export const ContentTextSchema = z.object({ + type: z.literal('text'), + text: z.string(), +}); + +/** + * Image content block. + * `path` is a filesystem path — never inline base64. + */ +export const ContentImageSchema = z.object({ + type: z.literal('image'), + media_type: z.string(), + path: z.string(), +}); + +/** File content block. */ +export const ContentFileSchema = z.object({ + type: z.literal('file'), + media_type: z.string(), + path: z.string(), +}); + +/** Discriminated union of all content block types. 
*/ +export const ContentSchema = z.discriminatedUnion('type', [ + ContentTextSchema, + ContentImageSchema, + ContentFileSchema, +]); + /** * Unified message schema for input, expected, and output messages. + * + * `content` is either a plain string or a `Content[]` array of typed blocks. + * Use `getTextContent()` from `@agentv/core` to extract plain text from either form. */ export const MessageSchema = z.object({ role: z.enum(['assistant', 'user', 'system', 'tool']), - content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(), + content: z.union([z.string(), z.array(ContentSchema)]).optional(), toolCalls: z.array(ToolCallSchema).optional(), name: z.string().optional(), startTime: z.string().optional(), @@ -106,6 +159,11 @@ export type Message = z.infer; export type ToolCall = z.infer; export type TokenUsage = z.infer; +export type ContentText = z.infer; +export type ContentImage = z.infer; +export type ContentFile = z.infer; +export type Content = z.infer; + /** * Prompt template input schema (camelCase, converted from snake_case wire format). * Uses the same schema as CodeGraderInput since the orchestrator sends identical payloads. 
diff --git a/packages/eval/test/define-code-grader.test.ts b/packages/eval/test/define-code-grader.test.ts index 67a77e878..2db1be399 100644 --- a/packages/eval/test/define-code-grader.test.ts +++ b/packages/eval/test/define-code-grader.test.ts @@ -7,8 +7,132 @@ import { // Backward-compat aliases CodeJudgeInputSchema, CodeJudgeResultSchema, + ContentFileSchema, + ContentImageSchema, + ContentSchema, + ContentTextSchema, + MessageSchema, } from '../src/schemas.js'; +// --------------------------------------------------------------------------- +// Content schemas +// --------------------------------------------------------------------------- + +describe('ContentSchema', () => { + it('parses ContentText', () => { + const result = ContentTextSchema.parse({ type: 'text', text: 'hello' }); + expect(result).toEqual({ type: 'text', text: 'hello' }); + }); + + it('parses ContentImage with path', () => { + const result = ContentImageSchema.parse({ + type: 'image', + media_type: 'image/png', + path: '/workspace/chart.png', + }); + expect(result).toEqual({ + type: 'image', + media_type: 'image/png', + path: '/workspace/chart.png', + }); + }); + + it('parses ContentFile', () => { + const result = ContentFileSchema.parse({ + type: 'file', + media_type: 'text/csv', + path: '/workspace/data.csv', + }); + expect(result).toEqual({ type: 'file', media_type: 'text/csv', path: '/workspace/data.csv' }); + }); + + it('discriminated union resolves correct variant', () => { + const text = ContentSchema.parse({ type: 'text', text: 'hi' }); + expect(text.type).toBe('text'); + + const image = ContentSchema.parse({ + type: 'image', + media_type: 'image/jpeg', + path: '/img.jpg', + }); + expect(image.type).toBe('image'); + + const file = ContentSchema.parse({ + type: 'file', + media_type: 'application/pdf', + path: '/doc.pdf', + }); + expect(file.type).toBe('file'); + }); + + it('rejects unknown content type', () => { + expect(() => ContentSchema.parse({ type: 'audio', data: '...' 
})).toThrow(); + }); +}); + +// --------------------------------------------------------------------------- +// MessageSchema with Content[] +// --------------------------------------------------------------------------- + +describe('MessageSchema content variants', () => { + it('accepts string content (backward compat)', () => { + const msg = MessageSchema.parse({ role: 'assistant', content: 'Hello' }); + expect(msg.content).toBe('Hello'); + }); + + it('accepts Content[] with text blocks', () => { + const msg = MessageSchema.parse({ + role: 'assistant', + content: [ + { type: 'text', text: 'paragraph 1' }, + { type: 'text', text: 'paragraph 2' }, + ], + }); + expect(Array.isArray(msg.content)).toBe(true); + expect(msg.content as unknown[]).toHaveLength(2); + }); + + it('accepts Content[] with image blocks', () => { + const msg = MessageSchema.parse({ + role: 'assistant', + content: [ + { type: 'text', text: 'Chart:' }, + { type: 'image', media_type: 'image/png', path: '/chart.png' }, + ], + }); + const content = msg.content as { type: string }[]; + expect(content[1].type).toBe('image'); + }); + + it('accepts Content[] with file blocks', () => { + const msg = MessageSchema.parse({ + role: 'assistant', + content: [{ type: 'file', media_type: 'text/csv', path: '/data.csv' }], + }); + const content = msg.content as { type: string }[]; + expect(content[0].type).toBe('file'); + }); + + it('accepts mixed Content[] (text + image + file)', () => { + const msg = MessageSchema.parse({ + role: 'assistant', + content: [ + { type: 'text', text: 'Analysis results:' }, + { type: 'image', media_type: 'image/png', path: '/chart.png' }, + { type: 'file', media_type: 'text/csv', path: '/data.csv' }, + ], + }); + const content = msg.content as { type: string }[]; + expect(content).toHaveLength(3); + expect(content.map((c) => c.type)).toEqual(['text', 'image', 'file']); + }); + + it('accepts undefined content', () => { + const msg = MessageSchema.parse({ role: 'tool' }); + 
expect(msg.content).toBeUndefined(); + }); +}); + describe('CodeGraderInputSchema', () => { const validInput = { criteria: 'The answer should be 4', @@ -68,6 +192,44 @@ describe('CodeGraderInputSchema', () => { const result = CodeGraderInputSchema.parse(inputWithOutput); expect(result.output?.[0].toolCalls?.[0].tool).toBe('read'); }); + + it('accepts output with Content[] containing image blocks', () => { + const inputWithImages = { + ...validInput, + output: [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Generated chart:' }, + { type: 'image', media_type: 'image/png', path: '/workspace/chart.png' }, + ], + }, + ], + }; + const result = CodeGraderInputSchema.parse(inputWithImages); + const content = result.output?.[0].content as { type: string; path?: string }[]; + expect(content).toHaveLength(2); + expect(content[1].type).toBe('image'); + expect(content[1].path).toBe('/workspace/chart.png'); + }); + + it('accepts input with Content[] messages', () => { + const inputWithContentArray = { + ...validInput, + input: [ + { + role: 'user', + content: [ + { type: 'text', text: 'Describe this image:' }, + { type: 'image', media_type: 'image/jpeg', path: '/workspace/photo.jpg' }, + ], + }, + ], + }; + const result = CodeGraderInputSchema.parse(inputWithContentArray); + const content = result.input[0].content as { type: string }[]; + expect(content).toHaveLength(2); + }); }); describe('CodeGraderResultSchema', () => {