Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/brave-nights-shout.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
'@tanstack/ai-client': patch
'@tanstack/ai': patch
---

feat: Add multimodal UIMessage support for images, audio, video, and documents
8 changes: 8 additions & 0 deletions packages/typescript/ai-client/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import type {
AnyClientTool,
AudioPart,
ChunkStrategy,
DocumentPart,
ImagePart,
InferToolInput,
InferToolOutput,
ModelMessage,
StreamChunk,
VideoPart,
} from '@tanstack/ai'
import type { ConnectionAdapter } from './connection-adapters'

Expand Down Expand Up @@ -119,6 +123,10 @@ export type MessagePart<TTools extends ReadonlyArray<AnyClientTool> = any> =
| ToolCallPart<TTools>
| ToolResultPart
| ThinkingPart
| ImagePart
| AudioPart
| VideoPart
| DocumentPart

/**
* UIMessage - Domain-specific message format optimized for building chat UIs
Expand Down
65 changes: 54 additions & 11 deletions packages/typescript/ai/src/activities/chat/messages.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import type {
AudioPart,
ContentPart,
DocumentPart,
ImagePart,
MessagePart,
ModelMessage,
TextPart,
ToolCallPart,
ToolResultPart,
UIMessage,
VideoPart,
} from '../../types'
// ===========================
// Message Converters
Expand Down Expand Up @@ -52,7 +56,8 @@ export function convertMessagesToModelMessages(
* Convert a UIMessage to ModelMessage(s)
*
* This conversion handles the parts-based structure:
 * - Text parts → content field
 * - Text parts → content field (string or ContentPart[])
 * - Multimodal parts (image, audio, video, document) → ContentPart[]
 * - ToolCall parts → toolCalls array
 * - ToolResult parts → separate role="tool" messages
*
Expand All @@ -72,12 +77,24 @@ export function uiMessageToModelMessages(
// Separate parts by type
// Note: thinking parts are UI-only and not included in ModelMessages
const textParts: Array<TextPart> = []
const imageParts: Array<ImagePart> = []
const audioParts: Array<AudioPart> = []
const videoParts: Array<VideoPart> = []
const documentParts: Array<DocumentPart> = []
const toolCallParts: Array<ToolCallPart> = []
const toolResultParts: Array<ToolResultPart> = []

for (const part of uiMessage.parts) {
if (part.type === 'text') {
textParts.push(part)
} else if (part.type === 'image') {
imageParts.push(part)
} else if (part.type === 'audio') {
audioParts.push(part)
} else if (part.type === 'video') {
videoParts.push(part)
} else if (part.type === 'document') {
documentParts.push(part)
} else if (part.type === 'tool-call') {
toolCallParts.push(part)
} else if (part.type === 'tool-result') {
Expand All @@ -86,8 +103,29 @@ export function uiMessageToModelMessages(
// thinking parts are skipped - they're UI-only
}

// Build the main message (user or assistant)
const content = textParts.map((p) => p.content).join('') || null
const hasMultimodalContent =
imageParts.length > 0 ||
audioParts.length > 0 ||
videoParts.length > 0 ||
documentParts.length > 0

// Build the content field - use ContentPart[] if multimodal, string otherwise
let content: string | null | Array<ContentPart>
if (hasMultimodalContent) {
content =
uiMessage.parts.filter(
(part): part is ContentPart =>
part.type === 'text' ||
part.type === 'image' ||
part.type === 'audio' ||
part.type === 'video' ||
part.type === 'document',
) || null
} else {
// Text-only: use simple string
content = textParts.map((p) => p.content).join('') || null
}

const toolCalls =
toolCallParts.length > 0
? toolCallParts
Expand Down Expand Up @@ -144,7 +182,7 @@ export function uiMessageToModelMessages(
* Convert a ModelMessage to UIMessage
*
* This conversion creates a parts-based structure:
 * - content field → TextPart
 * - content field → TextPart (for string) or multimodal MessageParts (for ContentPart[])
 * - toolCalls array → ToolCallPart[]
* - role="tool" messages should be converted separately and merged
*
Expand All @@ -158,13 +196,18 @@ export function modelMessageToUIMessage(
): UIMessage {
const parts: Array<MessagePart> = []

// Handle content (convert multimodal content to text for UI)
const textContent = getTextContent(modelMessage.content)
if (textContent) {
parts.push({
type: 'text',
content: textContent,
})
// Handle content - preserve multimodal content
if (modelMessage.content !== null) {
if (typeof modelMessage.content === 'string') {
if (modelMessage.content) {
parts.push({
type: 'text',
content: modelMessage.content,
})
}
} else if (Array.isArray(modelMessage.content)) {
parts.push(...(modelMessage.content as Array<MessagePart>))
}
}

// Handle tool calls
Expand Down
4 changes: 4 additions & 0 deletions packages/typescript/ai/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,10 @@ export type MessagePart =
| ToolCallPart
| ToolResultPart
| ThinkingPart
| ImagePart
| AudioPart
| VideoPart
| DocumentPart

/**
* UIMessage - Domain-specific message format optimized for building chat UIs
Expand Down
190 changes: 190 additions & 0 deletions packages/typescript/ai/tests/messages.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import { describe, expect, it } from 'vitest'
import {
modelMessageToUIMessage,
uiMessageToModelMessages,
} from '../src/activities/chat/messages'
import type { ModelMessage, UIMessage } from '../src/types'

// Round-trip coverage for the UIMessage <-> ModelMessage converters,
// focusing on how multimodal (image/audio/video/document) parts are handled.
describe('message converters', () => {
  describe('modelMessageToUIMessage', () => {
    it('should preserve text content', () => {
      const input: ModelMessage = {
        role: 'user',
        content: 'Hello world',
      }

      const ui = modelMessageToUIMessage(input)

      // A plain string becomes exactly one text part.
      expect(ui.parts).toHaveLength(1)
      expect(ui.parts[0]).toEqual({ type: 'text', content: 'Hello world' })
    })

    it('should preserve multimodal content', () => {
      const input: ModelMessage = {
        role: 'user',
        content: [
          { type: 'text', content: 'What is in this image?' },
          {
            type: 'image',
            source: { type: 'url', value: 'https://example.com/image.jpg' },
          },
        ],
      }

      const ui = modelMessageToUIMessage(input)

      // Each content part maps 1:1 onto a UI part, order intact.
      expect(ui.parts).toHaveLength(2)
      expect(ui.parts[0]).toEqual({
        type: 'text',
        content: 'What is in this image?',
      })
      expect(ui.parts[1]).toEqual({
        type: 'image',
        source: { type: 'url', value: 'https://example.com/image.jpg' },
      })
    })

    it('should preserve all multimodal types', () => {
      const input: ModelMessage = {
        role: 'user',
        content: [
          { type: 'text', content: 'Check these:' },
          { type: 'image', source: { type: 'data', value: 'base64img' } },
          {
            type: 'audio',
            source: { type: 'url', value: 'https://example.com/audio.mp3' },
          },
          {
            type: 'video',
            source: { type: 'url', value: 'https://example.com/video.mp4' },
          },
          { type: 'document', source: { type: 'data', value: 'base64pdf' } },
        ],
      }

      const ui = modelMessageToUIMessage(input)

      expect(ui.parts).toHaveLength(5)
      // Every supported part type survives conversion, in order.
      const expectedTypes = ['text', 'image', 'audio', 'video', 'document']
      expectedTypes.forEach((expectedType, index) => {
        expect(ui.parts[index]?.type).toBe(expectedType)
      })
    })

    it('should preserve metadata', () => {
      const input: ModelMessage = {
        role: 'user',
        content: [
          {
            type: 'image',
            source: { type: 'url', value: 'https://example.com/img.jpg' },
            metadata: { detail: 'high' },
          },
        ],
      }

      const ui = modelMessageToUIMessage(input)

      // Provider-specific metadata rides along untouched.
      expect(ui.parts[0]).toEqual({
        type: 'image',
        source: { type: 'url', value: 'https://example.com/img.jpg' },
        metadata: { detail: 'high' },
      })
    })
  })

  describe('uiMessageToModelMessages', () => {
    it('should convert text-only UIMessage to string content', () => {
      const ui: UIMessage = {
        id: 'msg-1',
        role: 'user',
        parts: [{ type: 'text', content: 'Hello world' }],
      }

      const result = uiMessageToModelMessages(ui)

      // Text-only messages collapse to the simple string form.
      expect(result).toHaveLength(1)
      expect(result[0]?.content).toBe('Hello world')
    })

    it('should convert multimodal UIMessage to ContentPart array', () => {
      const ui: UIMessage = {
        id: 'msg-1',
        role: 'user',
        parts: [
          { type: 'text', content: 'What is this?' },
          {
            type: 'image',
            source: { type: 'url', value: 'https://example.com/img.jpg' },
          },
        ],
      }

      const result = uiMessageToModelMessages(ui)

      // Any non-text part forces the ContentPart[] representation.
      expect(result).toHaveLength(1)
      expect(Array.isArray(result[0]?.content)).toBe(true)
      expect(result[0]?.content).toHaveLength(2)
    })

    it('should preserve part order during conversion', () => {
      const ui: UIMessage = {
        id: 'msg-1',
        role: 'user',
        parts: [
          {
            type: 'image',
            source: { type: 'url', value: 'https://example.com/1.jpg' },
          },
          { type: 'text', content: 'Middle text' },
          {
            type: 'image',
            source: { type: 'url', value: 'https://example.com/2.jpg' },
          },
        ],
      }

      const content = uiMessageToModelMessages(ui)[0]?.content as Array<any>

      // Interleaved parts keep their original positions.
      expect(content[0]?.type).toBe('image')
      expect(content[1]?.type).toBe('text')
      expect(content[2]?.type).toBe('image')
    })
  })

  describe('round-trip conversion', () => {
    it('should preserve multimodal content through round-trip', () => {
      const original: ModelMessage = {
        role: 'user',
        content: [
          { type: 'text', content: 'Describe this image' },
          {
            type: 'image',
            source: { type: 'url', value: 'https://example.com/photo.jpg' },
          },
        ],
      }

      // ModelMessage -> UIMessage -> ModelMessage should be lossless.
      const [roundTripped] = uiMessageToModelMessages(
        modelMessageToUIMessage(original),
      )

      expect(roundTripped?.role).toBe('user')
      expect(Array.isArray(roundTripped?.content)).toBe(true)
      const content = roundTripped?.content as Array<any>
      expect(content).toHaveLength(2)
      expect(content[0]).toEqual({
        type: 'text',
        content: 'Describe this image',
      })
      expect(content[1]).toEqual({
        type: 'image',
        source: { type: 'url', value: 'https://example.com/photo.jpg' },
      })
    })
  })
})
Loading