diff --git a/.changeset/brave-nights-shout.md b/.changeset/brave-nights-shout.md new file mode 100644 index 00000000..0098d79d --- /dev/null +++ b/.changeset/brave-nights-shout.md @@ -0,0 +1,6 @@ +--- +'@tanstack/ai-client': patch +'@tanstack/ai': patch +--- + +feat: Add multimodal UIMessage support for images, audio, video, and documents diff --git a/packages/typescript/ai-client/src/types.ts b/packages/typescript/ai-client/src/types.ts index 4f83debb..5ff40371 100644 --- a/packages/typescript/ai-client/src/types.ts +++ b/packages/typescript/ai-client/src/types.ts @@ -1,10 +1,14 @@ import type { AnyClientTool, + AudioPart, ChunkStrategy, + DocumentPart, + ImagePart, InferToolInput, InferToolOutput, ModelMessage, StreamChunk, + VideoPart, } from '@tanstack/ai' import type { ConnectionAdapter } from './connection-adapters' @@ -119,6 +123,10 @@ export type MessagePart = any> = | ToolCallPart | ToolResultPart | ThinkingPart + | ImagePart + | AudioPart + | VideoPart + | DocumentPart /** * UIMessage - Domain-specific message format optimized for building chat UIs diff --git a/packages/typescript/ai/src/activities/chat/messages.ts b/packages/typescript/ai/src/activities/chat/messages.ts index 14c8dc62..592a41b3 100644 --- a/packages/typescript/ai/src/activities/chat/messages.ts +++ b/packages/typescript/ai/src/activities/chat/messages.ts @@ -1,11 +1,15 @@ import type { + AudioPart, ContentPart, + DocumentPart, + ImagePart, MessagePart, ModelMessage, TextPart, ToolCallPart, ToolResultPart, UIMessage, + VideoPart, } from '../../types' // =========================== // Message Converters @@ -52,7 +56,8 @@ export function convertMessagesToModelMessages( * Convert a UIMessage to ModelMessage(s) * * This conversion handles the parts-based structure: - * - Text parts → content field + * - Text parts → content field (string or ContentPart[]) + * - Multimodal parts (image, audio, video, document) → ContentPart[] * - ToolCall parts → toolCalls array * - ToolResult parts → separate 
role="tool" messages * @@ -72,12 +77,24 @@ export function uiMessageToModelMessages( // Separate parts by type // Note: thinking parts are UI-only and not included in ModelMessages const textParts: Array<TextPart> = [] + const imageParts: Array<ImagePart> = [] + const audioParts: Array<AudioPart> = [] + const videoParts: Array<VideoPart> = [] + const documentParts: Array<DocumentPart> = [] const toolCallParts: Array<ToolCallPart> = [] const toolResultParts: Array<ToolResultPart> = [] for (const part of uiMessage.parts) { if (part.type === 'text') { textParts.push(part) + } else if (part.type === 'image') { + imageParts.push(part) + } else if (part.type === 'audio') { + audioParts.push(part) + } else if (part.type === 'video') { + videoParts.push(part) + } else if (part.type === 'document') { + documentParts.push(part) } else if (part.type === 'tool-call') { toolCallParts.push(part) } else if (part.type === 'tool-result') { @@ -86,8 +103,29 @@ // thinking parts are skipped - they're UI-only } - // Build the main message (user or assistant) - const content = textParts.map((p) => p.content).join('') || null + const hasMultimodalContent = + imageParts.length > 0 || + audioParts.length > 0 || + videoParts.length > 0 || + documentParts.length > 0 + + // Build the content field - use ContentPart[] if multimodal, string otherwise + let content: string | null | Array<ContentPart> + if (hasMultimodalContent) { + content = + uiMessage.parts.filter( + (part): part is ContentPart => + part.type === 'text' || + part.type === 'image' || + part.type === 'audio' || + part.type === 'video' || + part.type === 'document', + ) + } else { + // Text-only: use simple string + content = textParts.map((p) => p.content).join('') || null + } + const toolCalls = toolCallParts.length > 0 ?
toolCallParts @@ -144,7 +182,7 @@ * Convert a ModelMessage to UIMessage * * This conversion creates a parts-based structure: - * - content field → TextPart + * - content field → TextPart (for string) or multimodal MessageParts (for ContentPart[]) * - toolCalls array → ToolCallPart[] * - role="tool" messages should be converted separately and merged * @@ -158,13 +196,18 @@ export function modelMessageToUIMessage( ): UIMessage { const parts: Array<MessagePart> = [] - // Handle content (convert multimodal content to text for UI) - const textContent = getTextContent(modelMessage.content) - if (textContent) { - parts.push({ - type: 'text', - content: textContent, - }) + // Handle content - preserve multimodal content + if (modelMessage.content !== null) { + if (typeof modelMessage.content === 'string') { + if (modelMessage.content) { + parts.push({ + type: 'text', + content: modelMessage.content, + }) + } + } else if (Array.isArray(modelMessage.content)) { + parts.push(...(modelMessage.content as Array<MessagePart>)) + } } // Handle tool calls diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts index 7c49d995..e0f98463 100644 --- a/packages/typescript/ai/src/types.ts +++ b/packages/typescript/ai/src/types.ts @@ -285,6 +289,10 @@ export type MessagePart = | ToolCallPart | ToolResultPart | ThinkingPart + | ImagePart + | AudioPart + | VideoPart + | DocumentPart /** * UIMessage - Domain-specific message format optimized for building chat UIs diff --git a/packages/typescript/ai/tests/messages.test.ts b/packages/typescript/ai/tests/messages.test.ts new file mode 100644 index 00000000..4cde2b60 --- /dev/null +++ b/packages/typescript/ai/tests/messages.test.ts @@ -0,0 +1,190 @@ +import { describe, expect, it } from 'vitest' +import { + modelMessageToUIMessage, + uiMessageToModelMessages, +} from '../src/activities/chat/messages' +import type { ModelMessage, UIMessage } from '../src/types' + +describe('message converters', () =>
{ + describe('modelMessageToUIMessage', () => { + it('should preserve text content', () => { + const modelMessage: ModelMessage = { + role: 'user', + content: 'Hello world', + } + + const uiMessage = modelMessageToUIMessage(modelMessage) + + expect(uiMessage.parts).toHaveLength(1) + expect(uiMessage.parts[0]).toEqual({ + type: 'text', + content: 'Hello world', + }) + }) + + it('should preserve multimodal content', () => { + const modelMessage: ModelMessage = { + role: 'user', + content: [ + { type: 'text', content: 'What is in this image?' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/image.jpg' }, + }, + ], + } + + const uiMessage = modelMessageToUIMessage(modelMessage) + + expect(uiMessage.parts).toHaveLength(2) + expect(uiMessage.parts[0]).toEqual({ + type: 'text', + content: 'What is in this image?', + }) + expect(uiMessage.parts[1]).toEqual({ + type: 'image', + source: { type: 'url', value: 'https://example.com/image.jpg' }, + }) + }) + + it('should preserve all multimodal types', () => { + const modelMessage: ModelMessage = { + role: 'user', + content: [ + { type: 'text', content: 'Check these:' }, + { type: 'image', source: { type: 'data', value: 'base64img' } }, + { + type: 'audio', + source: { type: 'url', value: 'https://example.com/audio.mp3' }, + }, + { + type: 'video', + source: { type: 'url', value: 'https://example.com/video.mp4' }, + }, + { type: 'document', source: { type: 'data', value: 'base64pdf' } }, + ], + } + + const uiMessage = modelMessageToUIMessage(modelMessage) + + expect(uiMessage.parts).toHaveLength(5) + expect(uiMessage.parts[0]?.type).toBe('text') + expect(uiMessage.parts[1]?.type).toBe('image') + expect(uiMessage.parts[2]?.type).toBe('audio') + expect(uiMessage.parts[3]?.type).toBe('video') + expect(uiMessage.parts[4]?.type).toBe('document') + }) + + it('should preserve metadata', () => { + const modelMessage: ModelMessage = { + role: 'user', + content: [ + { + type: 'image', + source: { type: 'url', 
value: 'https://example.com/img.jpg' }, + metadata: { detail: 'high' }, + }, + ], + } + + const uiMessage = modelMessageToUIMessage(modelMessage) + + expect(uiMessage.parts[0]).toEqual({ + type: 'image', + source: { type: 'url', value: 'https://example.com/img.jpg' }, + metadata: { detail: 'high' }, + }) + }) + }) + + describe('uiMessageToModelMessages', () => { + it('should convert text-only UIMessage to string content', () => { + const uiMessage: UIMessage = { + id: 'msg-1', + role: 'user', + parts: [{ type: 'text', content: 'Hello world' }], + } + + const modelMessages = uiMessageToModelMessages(uiMessage) + + expect(modelMessages).toHaveLength(1) + expect(modelMessages[0]?.content).toBe('Hello world') + }) + + it('should convert multimodal UIMessage to ContentPart array', () => { + const uiMessage: UIMessage = { + id: 'msg-1', + role: 'user', + parts: [ + { type: 'text', content: 'What is this?' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/img.jpg' }, + }, + ], + } + + const modelMessages = uiMessageToModelMessages(uiMessage) + + expect(modelMessages).toHaveLength(1) + expect(Array.isArray(modelMessages[0]?.content)).toBe(true) + expect(modelMessages[0]?.content).toHaveLength(2) + }) + + it('should preserve part order during conversion', () => { + const uiMessage: UIMessage = { + id: 'msg-1', + role: 'user', + parts: [ + { + type: 'image', + source: { type: 'url', value: 'https://example.com/1.jpg' }, + }, + { type: 'text', content: 'Middle text' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/2.jpg' }, + }, + ], + } + + const modelMessages = uiMessageToModelMessages(uiMessage) + const content = modelMessages[0]?.content as Array<{ type: string }> + + expect(content[0]?.type).toBe('image') + expect(content[1]?.type).toBe('text') + expect(content[2]?.type).toBe('image') + }) + }) + + describe('round-trip conversion', () => { + it('should preserve multimodal content through round-trip', () => { + const original:
ModelMessage = { + role: 'user', + content: [ + { type: 'text', content: 'Describe this image' }, + { + type: 'image', + source: { type: 'url', value: 'https://example.com/photo.jpg' }, + }, + ], + } + + const uiMessage = modelMessageToUIMessage(original) + const [converted] = uiMessageToModelMessages(uiMessage) + + expect(converted?.role).toBe('user') + expect(Array.isArray(converted?.content)).toBe(true) + const content = converted?.content as Array<{ type: string }> + expect(content).toHaveLength(2) + expect(content[0]).toEqual({ + type: 'text', + content: 'Describe this image', + }) + expect(content[1]).toEqual({ + type: 'image', + source: { type: 'url', value: 'https://example.com/photo.jpg' }, + }) + }) + }) +})