From 9ab06414a02fd639a1e8dcf9d9fa4030b66ee1c3 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 29 Mar 2026 02:00:12 +0000
Subject: [PATCH] feat(core): Content union type for multimodal content model

Introduce a discriminated union Content type (ContentText | ContentImage |
ContentFile) that enables multimodal content to flow through the pipeline
without lossy flattening.

Changes:
- Add packages/core/src/evaluation/content.ts with Content union type,
  type guards (isContent, isContentArray), and getTextContent() accessor
- Update Message.content from 'unknown' to 'string | Content[]'
- Update extractLastAssistantContent() to handle Content[] via getTextContent()
- Update claude-cli provider to preserve non-text content blocks (images)
  instead of dropping them during extraction
- Update cli provider to handle Content[] from external processes
- Export all content types from @agentv/core public API
- Add 25 unit tests covering type guards, accessors, backward compat,
  and extractLastAssistantContent with Content[]

Closes #817

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 packages/core/src/evaluation/content.ts       | 103 +++++++++
 .../src/evaluation/providers/claude-cli.ts    |  47 +++-
 packages/core/src/evaluation/providers/cli.ts |   8 +-
 .../core/src/evaluation/providers/index.ts    |   2 +
 .../core/src/evaluation/providers/types.ts    |  11 +-
 packages/core/src/index.ts                    |   1 +
 packages/core/test/evaluation/content.test.ts | 205 ++++++++++++++++++
 7 files changed, 373 insertions(+), 4 deletions(-)
 create mode 100644 packages/core/src/evaluation/content.ts
 create mode 100644 packages/core/test/evaluation/content.test.ts
diff --git a/packages/core/src/evaluation/content.ts b/packages/core/src/evaluation/content.ts
new file mode 100644
index 000000000..48e61ad99
--- /dev/null
+++ b/packages/core/src/evaluation/content.ts
@@ -0,0 +1,103 @@
+/**
+ * Multimodal content types for the AgentV pipeline.
+ *
+ * Models structured content blocks (text, images, files) that flow end-to-end
+ * without lossy flattening. Modeled after Inspect AI's discriminated union approach.
+ *
+ * ## Content model
+ *
+ * `Message.content` accepts `string | Content[]`:
+ * - `string` — backward-compatible plain text (most common case)
+ * - `Content[]` — array of typed content blocks for multimodal messages
+ *
+ * Binary data (images, files) is referenced by URL/base64 string or filesystem
+ * path — never raw bytes. This keeps payloads serializable and lets code graders
+ * access files via path without decoding.
+ *
+ * ## How to extend
+ *
+ * To add a new content variant (e.g., `ContentAudio`):
+ * 1. Define the interface with a unique `type` discriminant
+ * 2. Add it to the `Content` union
+ * 3. Update `getTextContent()` if the new type has extractable text
+ * 4. Add the new type string to `CONTENT_TYPES` so `isContent()` accepts it
+ */
+
+// ---------------------------------------------------------------------------
+// Content block types
+// ---------------------------------------------------------------------------
+
+/** A text content block; `text` carries the plain-text payload. */
+export interface ContentText {
+  readonly type: 'text';
+  readonly text: string;
+}
+
+/**
+ * An image content block; `media_type` is the image's MIME type (e.g. `image/png`).
+ * `source` is a URL, data URI (base64), or filesystem path.
+ */
+export interface ContentImage {
+  readonly type: 'image';
+  readonly media_type: string;
+  readonly source: string;
+}
+
+/**
+ * A file content block; `media_type` is the file's MIME type (e.g. `text/csv`).
+ * `path` is a filesystem path or URL referencing the file.
+ */
+export interface ContentFile {
+  readonly type: 'file';
+  readonly media_type: string;
+  readonly path: string;
+}
+
+/** Discriminated union of all content block types; narrow on the `type` field. */
+export type Content = ContentText | ContentImage | ContentFile;
+
+// ---------------------------------------------------------------------------
+// Type guards
+// ---------------------------------------------------------------------------
+
+const CONTENT_TYPES = new Set<string>(['text', 'image', 'file']);
+
+/** True when a value is an object with a known `type` discriminant; other fields are not checked. */
+export function isContent(value: unknown): value is Content {
+  if (typeof value !== 'object' || value === null) return false;
+  const { type } = value as Record<string, unknown>;
+  return typeof type === 'string' && CONTENT_TYPES.has(type);
+}
+
+/** Check whether a value is a non-empty array whose elements all pass `isContent`. */
+export function isContentArray(value: unknown): value is Content[] {
+  return Array.isArray(value) && value.length !== 0 && !value.some((item) => !isContent(item));
+}
+
+// ---------------------------------------------------------------------------
+// Accessors
+// ---------------------------------------------------------------------------
+
+/**
+ * Collapse `string | Content[]` into plain text.
+ *
+ * - A string is returned unchanged.
+ * - For a `Content[]`, the `text` of every `ContentText` block is joined with
+ *   `'\n'`; image and file blocks contribute nothing.
+ * - `undefined`, `null`, and any other non-array value yield `''`.
+ *
+ * The input array is only read, never modified.
+ *
+ * @param content - Message content in either supported representation.
+ * @returns The extracted text, or `''` when there is none.
+ */
+export function getTextContent(content: string | Content[] | undefined | null): string {
+  if (typeof content === 'string') return content;
+  if (content == null || !Array.isArray(content)) return '';
+
+  // Keep only text blocks; joining once means no trailing separator.
+  return content
+    .filter((block): block is ContentText => block.type === 'text')
+    .map((block) => block.text)
+    .join('\n');
+}
diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts
index 27fa2e200..d400c2069 100644
--- a/packages/core/src/evaluation/providers/claude-cli.ts
+++ b/packages/core/src/evaluation/providers/claude-cli.ts
@@ -5,6 +5,7 @@ import type { WriteStream } from 'node:fs';
 import { mkdir } from 'node:fs/promises';
 import path from 'node:path';
 
+import type { Content } from '../content.js';
 import { recordClaudeLogEntry } from './claude-log-tracker.js';
 import { buildPromptDocument, normalizeInputFiles } from './preread.js';
 import type { ClaudeResolvedConfig } from './targets.js';
@@ -78,12 +79,13 @@ export class ClaudeCliProvider implements Provider {
             if (betaMessage && typeof betaMessage === 'object') {
               const msg = betaMessage as Record<string, unknown>;
               const content = msg.content;
+              const structuredContent = toContentArray(content);
               const textContent = extractTextContent(content);
               const toolCalls = extractToolCalls(content);
 
               const outputMsg: Message = {
                 role: 'assistant',
-                content: textContent,
+                content: structuredContent ?? textContent,
                 toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
               };
               output.push(outputMsg);
@@ -477,6 +479,49 @@ function summarizeEvent(event: Record<string, unknown>): string | undefined {
   }
 }
 
+/**
+ * Convert Claude's content array to Content[] preserving non-text blocks.
+ * Returns undefined when `content` is not an array or yields no image block,
+ * so callers keep the simpler plain-string representation.
+ *
+ * Image data is read from `source.data` (base64, emitted as a data URI) or
+ * `source.url`; a top-level `url` is still honored for backward compatibility.
+ * Images without a resolvable source are skipped rather than emitted as `''`.
+ */
+function toContentArray(content: unknown): Content[] | undefined {
+  if (!Array.isArray(content)) return undefined;
+
+  let hasNonText = false;
+  const blocks: Content[] = [];
+
+  for (const part of content) {
+    if (!part || typeof part !== 'object') continue;
+    const p = part as Record<string, unknown>;
+
+    if (p.type === 'text' && typeof p.text === 'string') {
+      blocks.push({ type: 'text', text: p.text });
+    } else if (p.type === 'image' && typeof p.source === 'object' && p.source !== null) {
+      const src = p.source as Record<string, unknown>;
+      const mediaType =
+        typeof p.media_type === 'string'
+          ? p.media_type
+          : typeof src.media_type === 'string'
+            ? src.media_type
+            : 'application/octet-stream';
+      let source: string | undefined;
+      if (typeof src.data === 'string') source = `data:${mediaType};base64,${src.data}`;
+      else if (typeof src.url === 'string') source = src.url;
+      else if (typeof p.url === 'string') source = p.url;
+      if (!source) continue; // no usable reference — an empty source helps nobody
+      blocks.push({ type: 'image', media_type: mediaType, source });
+      hasNonText = true;
+    }
+    // tool_use is surfaced separately as ToolCall; tool_result and other types are not content.
+  }
+
+  return hasNonText && blocks.length > 0 ? blocks : undefined;
+}
+
 /**
  * Extract text content from Claude's content array format.
  */
diff --git a/packages/core/src/evaluation/providers/cli.ts b/packages/core/src/evaluation/providers/cli.ts
index c1bb15f55..2a98a85cb 100644
--- a/packages/core/src/evaluation/providers/cli.ts
+++ b/packages/core/src/evaluation/providers/cli.ts
@@ -6,6 +6,8 @@ import { promisify } from 'node:util';
 
 import { z } from 'zod';
 
+import type { Content } from '../content.js';
+import { isContentArray } from '../content.js';
 import { readTextFile } from '../file-utils.js';
 import type { CliResolvedConfig } from './targets.js';
 import type {
@@ -124,7 +126,11 @@ function convertMessages(
   return messages.map((msg) => ({
     role: msg.role,
     name: msg.name,
-    content: msg.content,
+    content: isContentArray(msg.content)
+      ? (msg.content as Content[])
+      : typeof msg.content === 'string'
+        ? msg.content
+        : undefined,
     toolCalls: msg.tool_calls?.map((tc) => ({
       tool: tc.tool,
       input: tc.input,
diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts
index cd6658396..1a215b46b 100644
--- a/packages/core/src/evaluation/providers/index.ts
+++ b/packages/core/src/evaluation/providers/index.ts
@@ -37,6 +37,8 @@ export type {
   ToolCall,
 } from './types.js';
 
+export { extractLastAssistantContent } from './types.js';
+
 export type {
   AgentVResolvedConfig,
   AnthropicResolvedConfig,
diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts
index eb139c907..33cf09d9a 100644
--- a/packages/core/src/evaluation/providers/types.ts
+++ b/packages/core/src/evaluation/providers/types.ts
@@ -1,3 +1,5 @@
+import type { Content } from '../content.js';
+import { getTextContent, isContentArray } from '../content.js';
 import type { JsonObject } from '../types.js';
 
 export type ChatMessageRole = 'system' | 'user' | 'assistant' | 'tool' | 'function';
@@ -169,8 +171,8 @@ export interface Message {
   readonly role: string;
   /** Optional name for the message sender */
   readonly name?: string;
-  /** Message content */
-  readonly content?: unknown;
+  /** Message content — plain string or structured content blocks for multimodal data. */
+  readonly content?: string | Content[];
   /** Tool calls made in this message */
   readonly toolCalls?: readonly ToolCall[];
   /** ISO 8601 timestamp when the message started */
@@ -222,6 +224,8 @@ export interface ProviderResponse {
 /**
  * Extract the content from the last assistant message in an output message array.
  * Returns empty string if no assistant message found.
+ *
+ * Handles both plain-string content and Content[] (extracts text blocks).
  */
 export function extractLastAssistantContent(messages: readonly Message[] | undefined): string {
   if (!messages || messages.length === 0) {
@@ -235,6 +239,9 @@ export function extractLastAssistantContent(messages: readonly Message[] | undef
       if (typeof msg.content === 'string') {
         return msg.content;
       }
+      if (isContentArray(msg.content)) {
+        return getTextContent(msg.content);
+      }
       return JSON.stringify(msg.content);
     }
   }
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 0e457b4d9..ed78dc5ab 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -1,3 +1,4 @@
+export * from './evaluation/content.js';
 export * from './evaluation/types.js';
 export * from './evaluation/trace.js';
 export * from './evaluation/yaml-parser.js';
diff --git a/packages/core/test/evaluation/content.test.ts b/packages/core/test/evaluation/content.test.ts
new file mode 100644
index 000000000..ff4ce33d9
--- /dev/null
+++ b/packages/core/test/evaluation/content.test.ts
@@ -0,0 +1,205 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  type Content,
+  type ContentFile,
+  type ContentImage,
+  type ContentText,
+  getTextContent,
+  isContent,
+  isContentArray,
+} from '../../src/evaluation/content.js';
+import { type Message, extractLastAssistantContent } from '../../src/evaluation/providers/types.js';
+
+// ---------------------------------------------------------------------------
+// Content type guards
+// ---------------------------------------------------------------------------
+
+describe('isContent', () => {
+  it('returns true for ContentText', () => {
+    expect(isContent({ type: 'text', text: 'hello' })).toBe(true);
+  });
+
+  it('returns true for ContentImage', () => {
+    expect(isContent({ type: 'image', media_type: 'image/png', source: 'data:...' })).toBe(true);
+  });
+
+  it('returns true for ContentFile', () => {
+    expect(isContent({ type: 'file', media_type: 'text/plain', path: '/tmp/f.txt' })).toBe(true);
+  });
+
+  it('returns false for non-object values', () => {
+    expect(isContent(null)).toBe(false);
+    expect(isContent(undefined)).toBe(false);
+    expect(isContent('text')).toBe(false);
+    expect(isContent(42)).toBe(false);
+  });
+
+  it('returns false for objects with unknown type', () => {
+    expect(isContent({ type: 'audio', data: '...' })).toBe(false);
+    expect(isContent({ type: 123 })).toBe(false);
+    expect(isContent({})).toBe(false);
+  });
+});
+
+describe('isContentArray', () => {
+  it('returns true for array of valid Content blocks', () => {
+    const blocks: Content[] = [
+      { type: 'text', text: 'hello' },
+      { type: 'image', media_type: 'image/png', source: 'data:...' },
+    ];
+    expect(isContentArray(blocks)).toBe(true);
+  });
+
+  it('returns false for empty array', () => {
+    expect(isContentArray([])).toBe(false);
+  });
+
+  it('returns false for array with non-Content items', () => {
+    expect(isContentArray([{ type: 'unknown' }])).toBe(false);
+    expect(isContentArray(['hello'])).toBe(false);
+  });
+
+  it('returns false for non-array values', () => {
+    expect(isContentArray('text')).toBe(false);
+    expect(isContentArray(null)).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// getTextContent
+// ---------------------------------------------------------------------------
+
+describe('getTextContent', () => {
+  it('returns string content directly', () => {
+    expect(getTextContent('hello world')).toBe('hello world');
+  });
+
+  it('returns empty string for undefined', () => {
+    expect(getTextContent(undefined)).toBe('');
+  });
+
+  it('returns empty string for null', () => {
+    expect(getTextContent(null)).toBe('');
+  });
+
+  it('extracts text from ContentText blocks', () => {
+    const content: Content[] = [
+      { type: 'text', text: 'line 1' },
+      { type: 'text', text: 'line 2' },
+    ];
+    expect(getTextContent(content)).toBe('line 1\nline 2');
+  });
+
+  it('skips non-text blocks when extracting text', () => {
+    const content: Content[] = [
+      { type: 'text', text: 'hello' },
+      { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,...' },
+      { type: 'text', text: 'world' },
+    ];
+    expect(getTextContent(content)).toBe('hello\nworld');
+  });
+
+  it('returns empty string for Content[] with no text blocks', () => {
+    const content: Content[] = [
+      { type: 'image', media_type: 'image/png', source: 'data:...' },
+      { type: 'file', media_type: 'text/plain', path: '/f.txt' },
+    ];
+    expect(getTextContent(content)).toBe('');
+  });
+
+  it('handles single text block', () => {
+    const content: Content[] = [{ type: 'text', text: 'only text' }];
+    expect(getTextContent(content)).toBe('only text');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// extractLastAssistantContent with Content[]
+// ---------------------------------------------------------------------------
+
+describe('extractLastAssistantContent with Content[]', () => {
+  it('extracts text from Content[] in assistant message', () => {
+    const messages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Here is the chart:' },
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc' },
+        ],
+      },
+    ];
+    expect(extractLastAssistantContent(messages)).toBe('Here is the chart:');
+  });
+
+  it('still works with plain string content (backward compat)', () => {
+    const messages: Message[] = [{ role: 'assistant', content: 'plain text response' }];
+    expect(extractLastAssistantContent(messages)).toBe('plain text response');
+  });
+
+  it('returns empty string for no assistant messages', () => {
+    const messages: Message[] = [{ role: 'user', content: 'question' }];
+    expect(extractLastAssistantContent(messages)).toBe('');
+  });
+
+  it('returns empty string for undefined messages', () => {
+    expect(extractLastAssistantContent(undefined)).toBe('');
+    expect(extractLastAssistantContent([])).toBe('');
+  });
+
+  it('finds the last assistant message in a conversation', () => {
+    const messages: Message[] = [
+      { role: 'assistant', content: 'first response' },
+      { role: 'user', content: 'follow-up' },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'second response' },
+          { type: 'file', media_type: 'text/csv', path: '/data.csv' },
+        ],
+      },
+    ];
+    expect(extractLastAssistantContent(messages)).toBe('second response');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Type compatibility — compile-time checks
+// ---------------------------------------------------------------------------
+
+describe('Message type compatibility', () => {
+  it('accepts string content', () => {
+    const msg: Message = { role: 'assistant', content: 'hello' };
+    expect(msg.content).toBe('hello');
+  });
+
+  it('accepts Content[] content', () => {
+    const msg: Message = {
+      role: 'assistant',
+      content: [
+        { type: 'text', text: 'hello' },
+        { type: 'image', media_type: 'image/png', source: 'base64data' },
+      ],
+    };
+    expect(Array.isArray(msg.content)).toBe(true);
+  });
+
+  it('accepts undefined content', () => {
+    const msg: Message = { role: 'assistant' };
+    expect(msg.content).toBeUndefined();
+  });
+
+  it('preserves Content subtypes in Content[]', () => {
+    const text: ContentText = { type: 'text', text: 'hi' };
+    const image: ContentImage = { type: 'image', media_type: 'image/jpeg', source: '/img.jpg' };
+    const file: ContentFile = { type: 'file', media_type: 'application/pdf', path: '/doc.pdf' };
+
+    const msg: Message = { role: 'assistant', content: [text, image, file] };
+    const blocks = msg.content as Content[];
+
+    expect(blocks).toHaveLength(3);
+    expect(blocks[0].type).toBe('text');
+    expect(blocks[1].type).toBe('image');
+    expect(blocks[2].type).toBe('file');
+  });
+});