From 9ab06414a02fd639a1e8dcf9d9fa4030b66ee1c3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 02:00:12 +0000 Subject: [PATCH] feat(core): Content union type for multimodal content model Introduce a discriminated union Content type (ContentText | ContentImage | ContentFile) that enables multimodal content to flow through the pipeline without lossy flattening. Changes: - Add packages/core/src/evaluation/content.ts with Content union type, type guards (isContent, isContentArray), and getTextContent() accessor - Update Message.content from 'unknown' to 'string | Content[]' - Update extractLastAssistantContent() to handle Content[] via getTextContent() - Update claude-cli provider to preserve non-text content blocks (images) instead of dropping them during extraction - Update cli provider to handle Content[] from external processes - Export all content types from @agentv/core public API - Add 25 unit tests covering type guards, accessors, backward compat, and extractLastAssistantContent with Content[] Closes #817 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- packages/core/src/evaluation/content.ts | 103 +++++++++ .../src/evaluation/providers/claude-cli.ts | 47 +++- packages/core/src/evaluation/providers/cli.ts | 8 +- .../core/src/evaluation/providers/index.ts | 2 + .../core/src/evaluation/providers/types.ts | 11 +- packages/core/src/index.ts | 1 + packages/core/test/evaluation/content.test.ts | 205 ++++++++++++++++++ 7 files changed, 373 insertions(+), 4 deletions(-) create mode 100644 packages/core/src/evaluation/content.ts create mode 100644 packages/core/test/evaluation/content.test.ts diff --git a/packages/core/src/evaluation/content.ts b/packages/core/src/evaluation/content.ts new file mode 100644 index 000000000..48e61ad99 --- /dev/null +++ b/packages/core/src/evaluation/content.ts @@ -0,0 +1,103 @@ +/** + * Multimodal content types for the AgentV pipeline. 
+ * + * Models structured content blocks (text, images, files) that flow end-to-end + * without lossy flattening. Modeled after Inspect AI's discriminated union approach. + * + * ## Content model + * + * `Message.content` accepts `string | Content[]`: + * - `string` — backward-compatible plain text (most common case) + * - `Content[]` — array of typed content blocks for multimodal messages + * + * Binary data (images, files) is referenced by URL/base64 string or filesystem + * path — never raw bytes. This keeps payloads serializable and lets code graders + * access files via path without decoding. + * + * ## How to extend + * + * To add a new content variant (e.g., `ContentAudio`): + * 1. Define the interface with a unique `type` discriminant + * 2. Add it to the `Content` union + * 3. Update `getTextContent()` if the new type has extractable text + * 4. Update `isContent()` type guard with the new type string + */ + +// --------------------------------------------------------------------------- +// Content block types +// --------------------------------------------------------------------------- + +/** A text content block. */ +export interface ContentText { + readonly type: 'text'; + readonly text: string; +} + +/** + * An image content block. + * `source` is a URL, data URI (base64), or filesystem path. + */ +export interface ContentImage { + readonly type: 'image'; + readonly media_type: string; + readonly source: string; +} + +/** + * A file content block. + * `path` is a filesystem path or URL referencing the file. + */ +export interface ContentFile { + readonly type: 'file'; + readonly media_type: string; + readonly path: string; +} + +/** Discriminated union of all content block types. 
*/ +export type Content = ContentText | ContentImage | ContentFile; + +// --------------------------------------------------------------------------- +// Type guards +// --------------------------------------------------------------------------- + +const CONTENT_TYPES = new Set(['text', 'image', 'file']); + +/** Check whether a value is an object whose `type` is a known `Content` discriminant; payload fields are not validated. */ +export function isContent(value: unknown): value is Content { + if (!value || typeof value !== 'object') return false; + const v = value as Record; + return typeof v.type === 'string' && CONTENT_TYPES.has(v.type); +} + +/** Check whether a value is a non-empty array in which every element passes `isContent`. */ +export function isContentArray(value: unknown): value is Content[] { + return Array.isArray(value) && value.length > 0 && value.every(isContent); +} + +// --------------------------------------------------------------------------- +// Accessors +// --------------------------------------------------------------------------- + +/** + * Extract plain text from `string | Content[]`. + * + * - If `content` is a string, returns it directly. + * - If `content` is a `Content[]`, concatenates all `ContentText.text` values + * (separated by newlines) and returns the result. + * - Returns `''` for `undefined`/`null`/unrecognized shapes. + * + * This is a **non-destructive** accessor — the original `Content[]` is preserved. 
+ */ +export function getTextContent(content: string | Content[] | undefined | null): string { + if (content == null) return ''; + if (typeof content === 'string') return content; + if (!Array.isArray(content)) return ''; + + const parts: string[] = []; + for (const block of content) { + if (block.type === 'text') { + parts.push(block.text); + } + } + return parts.join('\n'); +} diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index 27fa2e200..d400c2069 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -5,6 +5,7 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; +import type { Content } from '../content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { ClaudeResolvedConfig } from './targets.js'; @@ -78,12 +79,13 @@ export class ClaudeCliProvider implements Provider { if (betaMessage && typeof betaMessage === 'object') { const msg = betaMessage as Record; const content = msg.content; + const structuredContent = toContentArray(content); const textContent = extractTextContent(content); const toolCalls = extractToolCalls(content); const outputMsg: Message = { role: 'assistant', - content: textContent, + content: structuredContent ?? textContent, toolCalls: toolCalls.length > 0 ? toolCalls : undefined, }; output.push(outputMsg); @@ -477,6 +479,49 @@ function summarizeEvent(event: Record): string | undefined { } } +/** + * Convert Claude's content array to Content[], keeping text and image blocks (all other block types are dropped). + * Returns undefined if content is not an array or contains no image block + * (no benefit over the simpler string representation). 
+ */ +function toContentArray(content: unknown): Content[] | undefined { + if (!Array.isArray(content)) return undefined; + + let hasNonText = false; + const blocks: Content[] = []; + + for (const part of content) { + if (!part || typeof part !== 'object') continue; + const p = part as Record<string, unknown>; + + if (p.type === 'text' && typeof p.text === 'string') { + blocks.push({ type: 'text', text: p.text }); + } else if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) { + const src = p.source as Record<string, unknown>; + const mediaType = + typeof p.media_type === 'string' + ? p.media_type + : typeof src.media_type === 'string' + ? src.media_type + : 'application/octet-stream'; + // Inline base64 data becomes a data URI. URL images carry the address in + // `source.url`; a top-level `url` is still accepted as a fallback. + const data = + typeof src.data === 'string' + ? `data:${mediaType};base64,${src.data}` + : [src.url, p.url].find((u): u is string => typeof u === 'string') ?? ''; + blocks.push({ type: 'image', media_type: mediaType, source: data }); + hasNonText = true; + } else if (p.type === 'tool_use') { + // tool_use blocks are handled separately as ToolCall — skip + } else if (p.type === 'tool_result') { + // tool_result blocks are not user content — skip + } + } + + return hasNonText && blocks.length > 0 ? blocks : undefined; +} + /** * Extract text content from Claude's content array format. */ diff --git a/packages/core/src/evaluation/providers/cli.ts b/packages/core/src/evaluation/providers/cli.ts index c1bb15f55..2a98a85cb 100644 --- a/packages/core/src/evaluation/providers/cli.ts +++ b/packages/core/src/evaluation/providers/cli.ts @@ -6,6 +6,8 @@ import { promisify } from 'node:util'; import { z } from 'zod'; +import type { Content } from '../content.js'; +import { isContentArray } from '../content.js'; import { readTextFile } from '../file-utils.js'; import type { CliResolvedConfig } from './targets.js'; import type { @@ -124,7 +126,11 @@ function convertMessages( return messages.map((msg) => ({ role: msg.role, name: msg.name, - content: msg.content, + content: isContentArray(msg.content) + ? 
(msg.content as Content[]) + : typeof msg.content === 'string' + ? msg.content + : undefined, toolCalls: msg.tool_calls?.map((tc) => ({ tool: tc.tool, input: tc.input, diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index cd6658396..1a215b46b 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -37,6 +37,8 @@ export type { ToolCall, } from './types.js'; +export { extractLastAssistantContent } from './types.js'; + export type { AgentVResolvedConfig, AnthropicResolvedConfig, diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index eb139c907..33cf09d9a 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -1,3 +1,5 @@ +import type { Content } from '../content.js'; +import { getTextContent, isContentArray } from '../content.js'; import type { JsonObject } from '../types.js'; export type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function'; @@ -169,8 +171,8 @@ export interface Message { readonly role: string; /** Optional name for the message sender */ readonly name?: string; - /** Message content */ - readonly content?: unknown; + /** Message content — plain string or structured content blocks for multimodal data. */ + readonly content?: string | Content[]; /** Tool calls made in this message */ readonly toolCalls?: readonly ToolCall[]; /** ISO 8601 timestamp when the message started */ @@ -222,6 +224,8 @@ export interface ProviderResponse { /** * Extract the content from the last assistant message in an output message array. * Returns empty string if no assistant message found. + * + * Handles both plain-string content and Content[] (extracts text blocks). 
*/ export function extractLastAssistantContent(messages: readonly Message[] | undefined): string { if (!messages || messages.length === 0) { @@ -235,6 +239,9 @@ export function extractLastAssistantContent(messages: readonly Message[] | undef if (typeof msg.content === 'string') { return msg.content; } + if (isContentArray(msg.content)) { + return getTextContent(msg.content); + } return JSON.stringify(msg.content); } } diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 0e457b4d9..ed78dc5ab 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -1,3 +1,4 @@ +export * from './evaluation/content.js'; export * from './evaluation/types.js'; export * from './evaluation/trace.js'; export * from './evaluation/yaml-parser.js'; diff --git a/packages/core/test/evaluation/content.test.ts b/packages/core/test/evaluation/content.test.ts new file mode 100644 index 000000000..ff4ce33d9 --- /dev/null +++ b/packages/core/test/evaluation/content.test.ts @@ -0,0 +1,205 @@ +import { describe, expect, it } from 'vitest'; + +import { + type Content, + type ContentFile, + type ContentImage, + type ContentText, + getTextContent, + isContent, + isContentArray, +} from '../../src/evaluation/content.js'; +import { type Message, extractLastAssistantContent } from '../../src/evaluation/providers/types.js'; + +// --------------------------------------------------------------------------- +// Content type guards +// --------------------------------------------------------------------------- + +describe('isContent', () => { + it('returns true for ContentText', () => { + expect(isContent({ type: 'text', text: 'hello' })).toBe(true); + }); + + it('returns true for ContentImage', () => { + expect(isContent({ type: 'image', media_type: 'image/png', source: 'data:...' 
})).toBe(true); + }); + + it('returns true for ContentFile', () => { + expect(isContent({ type: 'file', media_type: 'text/plain', path: '/tmp/f.txt' })).toBe(true); + }); + + it('returns false for non-object values', () => { + expect(isContent(null)).toBe(false); + expect(isContent(undefined)).toBe(false); + expect(isContent('text')).toBe(false); + expect(isContent(42)).toBe(false); + }); + + it('returns false for objects with unknown type', () => { + expect(isContent({ type: 'audio', data: '...' })).toBe(false); + expect(isContent({ type: 123 })).toBe(false); + expect(isContent({})).toBe(false); + }); +}); + +describe('isContentArray', () => { + it('returns true for array of valid Content blocks', () => { + const blocks: Content[] = [ + { type: 'text', text: 'hello' }, + { type: 'image', media_type: 'image/png', source: 'data:...' }, + ]; + expect(isContentArray(blocks)).toBe(true); + }); + + it('returns false for empty array', () => { + expect(isContentArray([])).toBe(false); + }); + + it('returns false for array with non-Content items', () => { + expect(isContentArray([{ type: 'unknown' }])).toBe(false); + expect(isContentArray(['hello'])).toBe(false); + }); + + it('returns false for non-array values', () => { + expect(isContentArray('text')).toBe(false); + expect(isContentArray(null)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// getTextContent +// --------------------------------------------------------------------------- + +describe('getTextContent', () => { + it('returns string content directly', () => { + expect(getTextContent('hello world')).toBe('hello world'); + }); + + it('returns empty string for undefined', () => { + expect(getTextContent(undefined)).toBe(''); + }); + + it('returns empty string for null', () => { + expect(getTextContent(null)).toBe(''); + }); + + it('extracts text from ContentText blocks', () => { + const content: Content[] = [ + { type: 'text', text: 'line 1' }, + { 
type: 'text', text: 'line 2' }, + ]; + expect(getTextContent(content)).toBe('line 1\nline 2'); + }); + + it('skips non-text blocks when extracting text', () => { + const content: Content[] = [ + { type: 'text', text: 'hello' }, + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,...' }, + { type: 'text', text: 'world' }, + ]; + expect(getTextContent(content)).toBe('hello\nworld'); + }); + + it('returns empty string for Content[] with no text blocks', () => { + const content: Content[] = [ + { type: 'image', media_type: 'image/png', source: 'data:...' }, + { type: 'file', media_type: 'text/plain', path: '/f.txt' }, + ]; + expect(getTextContent(content)).toBe(''); + }); + + it('handles single text block', () => { + const content: Content[] = [{ type: 'text', text: 'only text' }]; + expect(getTextContent(content)).toBe('only text'); + }); +}); + +// --------------------------------------------------------------------------- +// extractLastAssistantContent with Content[] +// --------------------------------------------------------------------------- + +describe('extractLastAssistantContent with Content[]', () => { + it('extracts text from Content[] in assistant message', () => { + const messages: Message[] = [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Here is the chart:' }, + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc' }, + ], + }, + ]; + expect(extractLastAssistantContent(messages)).toBe('Here is the chart:'); + }); + + it('still works with plain string content (backward compat)', () => { + const messages: Message[] = [{ role: 'assistant', content: 'plain text response' }]; + expect(extractLastAssistantContent(messages)).toBe('plain text response'); + }); + + it('returns empty string for no assistant messages', () => { + const messages: Message[] = [{ role: 'user', content: 'question' }]; + expect(extractLastAssistantContent(messages)).toBe(''); + }); + + it('returns empty string for 
undefined messages', () => { + expect(extractLastAssistantContent(undefined)).toBe(''); + expect(extractLastAssistantContent([])).toBe(''); + }); + + it('finds the last assistant message in a conversation', () => { + const messages: Message[] = [ + { role: 'assistant', content: 'first response' }, + { role: 'user', content: 'follow-up' }, + { + role: 'assistant', + content: [ + { type: 'text', text: 'second response' }, + { type: 'file', media_type: 'text/csv', path: '/data.csv' }, + ], + }, + ]; + expect(extractLastAssistantContent(messages)).toBe('second response'); + }); +}); + +// --------------------------------------------------------------------------- +// Type compatibility — compile-time checks +// --------------------------------------------------------------------------- + +describe('Message type compatibility', () => { + it('accepts string content', () => { + const msg: Message = { role: 'assistant', content: 'hello' }; + expect(msg.content).toBe('hello'); + }); + + it('accepts Content[] content', () => { + const msg: Message = { + role: 'assistant', + content: [ + { type: 'text', text: 'hello' }, + { type: 'image', media_type: 'image/png', source: 'base64data' }, + ], + }; + expect(Array.isArray(msg.content)).toBe(true); + }); + + it('accepts undefined content', () => { + const msg: Message = { role: 'assistant' }; + expect(msg.content).toBeUndefined(); + }); + + it('preserves Content subtypes in Content[]', () => { + const text: ContentText = { type: 'text', text: 'hi' }; + const image: ContentImage = { type: 'image', media_type: 'image/jpeg', source: '/img.jpg' }; + const file: ContentFile = { type: 'file', media_type: 'application/pdf', path: '/doc.pdf' }; + + const msg: Message = { role: 'assistant', content: [text, image, file] }; + const blocks = msg.content as Content[]; + + expect(blocks).toHaveLength(3); + expect(blocks[0].type).toBe('text'); + expect(blocks[1].type).toBe('image'); + expect(blocks[2].type).toBe('file'); + }); +});