diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 678999bd..62f6041c 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -47,6 +47,7 @@ export { buildScoreRangeOutputSchema, calculateRubricScore, DEFAULT_EVALUATOR_TEMPLATE, + extractImageBlocks, substituteVariables, freeformEvaluationSchema, rubricEvaluationSchema, diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts index 553f0677..05ae50ad 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader.ts +++ b/packages/core/src/evaluation/evaluators/llm-grader.ts @@ -4,7 +4,9 @@ import path from 'node:path'; import { generateText, stepCountIs, tool } from 'ai'; import { z } from 'zod'; -import type { Provider, ProviderResponse } from '../providers/types.js'; +import type { ContentImage } from '../content.js'; +import { isContentArray } from '../content.js'; +import type { Message, Provider, ProviderResponse } from '../providers/types.js'; import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; import { DEPRECATED_TEMPLATE_VARIABLES, TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; @@ -242,6 +244,9 @@ export class LlmGraderEvaluator implements Evaluator { systemPrompt, }; + // Extract image blocks from agent output for multimodal grading + const images = context.output ? 
extractImageBlocks(context.output) : []; + try { const { data, tokenUsage } = await this.runWithRetry({ context, @@ -249,6 +254,7 @@ export class LlmGraderEvaluator implements Evaluator { systemPrompt, userPrompt, schema: freeformEvaluationSchema, + images, }); const score = clampScore(data.score); @@ -309,6 +315,9 @@ export class LlmGraderEvaluator implements Evaluator { systemPrompt, }; + // Extract image blocks from agent output for multimodal grading + const images = context.output ? extractImageBlocks(context.output) : []; + try { const { data, tokenUsage } = await this.runWithRetry({ context, @@ -316,6 +325,7 @@ export class LlmGraderEvaluator implements Evaluator { systemPrompt, userPrompt: prompt, schema: rubricEvaluationSchema, + images, }); const { score, verdict, assertions } = calculateRubricScore(data, rubrics); @@ -361,6 +371,9 @@ export class LlmGraderEvaluator implements Evaluator { systemPrompt, }; + // Extract image blocks from agent output for multimodal grading + const images = context.output ? 
extractImageBlocks(context.output) : []; + try { const { data, tokenUsage } = await this.runWithRetry({ context, @@ -368,6 +381,7 @@ export class LlmGraderEvaluator implements Evaluator { systemPrompt, userPrompt: prompt, schema: scoreRangeEvaluationSchema, + images, }); const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics); @@ -936,8 +950,9 @@ export class LlmGraderEvaluator implements Evaluator { readonly systemPrompt: string; readonly userPrompt: string; readonly schema: z.ZodSchema; + readonly images?: readonly ContentImage[]; }): Promise<{ data: T; providerResponse?: ProviderResponse; tokenUsage?: TokenUsage }> { - const { context, graderProvider, systemPrompt, userPrompt, schema } = options; + const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options; let lastError: Error | undefined; @@ -946,13 +961,34 @@ export class LlmGraderEvaluator implements Evaluator { // Prefer Vercel AI SDK language model if available. const model = graderProvider.asLanguageModel?.(); if (model) { - const result = await generateText({ - model, - system: systemPrompt, - prompt: userPrompt, + const modelOptions = { ...(this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {}), ...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}), - }); + }; + + // When images are present, use multi-part messages instead of plain prompt + const hasImages = images && images.length > 0; + const result = hasImages + ? 
await generateText({ + model, + system: systemPrompt, + messages: [ + { + role: 'user' as const, + content: [ + { type: 'text' as const, text: userPrompt }, + ...toAiSdkImageParts(images), + ], + }, + ], + ...modelOptions, + }) + : await generateText({ + model, + system: systemPrompt, + prompt: userPrompt, + ...modelOptions, + }); const data = schema.parse(parseJsonFromText(result.text)); const rawUsage = result.usage; @@ -1200,6 +1236,48 @@ function calculateScoreRangeResult( }; } +// --------------------------------------------------------------------------- +// Multimodal helpers — extract image blocks from agent output messages +// --------------------------------------------------------------------------- + +/** + * Extract all `ContentImage` blocks from assistant messages. + * + * Scans `messages` for assistant-role entries whose `content` is a `Content[]` + * array and collects every `ContentImage` block. Non-assistant messages and + * plain-string content are skipped. + */ +export function extractImageBlocks(messages: readonly Message[]): ContentImage[] { + const images: ContentImage[] = []; + for (const msg of messages) { + if (msg.role !== 'assistant') continue; + if (!isContentArray(msg.content)) continue; + for (const block of msg.content) { + if (block.type === 'image') { + images.push(block); + } + } + } + return images; +} + +/** + * Convert AgentV `ContentImage` blocks to Vercel AI SDK image content parts. + * + * The AI SDK `ImagePart` expects `{ type: 'image', image: string | URL, mediaType?: string }`. + * `ContentImage.source` may be a URL, data URI, or base64 string — all are passed through + * as the `image` field which the SDK handles natively. 
+ */ +function toAiSdkImageParts( + images: readonly ContentImage[], +): Array<{ type: 'image'; image: string; mediaType?: string }> { + return images.map((img) => ({ + type: 'image' as const, + image: img.source, + mediaType: img.media_type || undefined, + })); +} + // --------------------------------------------------------------------------- // Sandboxed filesystem tools for built-in agent mode // --------------------------------------------------------------------------- diff --git a/packages/core/test/evaluation/llm-grader-multimodal.test.ts b/packages/core/test/evaluation/llm-grader-multimodal.test.ts new file mode 100644 index 00000000..1db1c417 --- /dev/null +++ b/packages/core/test/evaluation/llm-grader-multimodal.test.ts @@ -0,0 +1,356 @@ +/** + * Tests for LLM grader multimodal support — auto-appending image content blocks + * from agent output to the judge message. + * + * Verifies: + * - Images from assistant messages are extracted and sent to the judge + * - Text-only output is unchanged (backward compatible) + * - Multiple images are all appended + * - Images in non-assistant messages are ignored + */ + +import { describe, expect, it, mock, beforeEach } from 'bun:test'; + +import type { Message } from '../../src/evaluation/providers/types.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { EvalTest } from '../../src/evaluation/types.js'; + +// --------------------------------------------------------------------------- +// Mock generateText to capture what the LLM grader sends to the judge. +// Must be set up before importing the module under test. 
+// ---------------------------------------------------------------------------
+
+let capturedGenerateTextArgs: Record<string, unknown> | undefined;
+
+function graderJsonResponse(score: number): string {
+  return JSON.stringify({
+    score,
+    assertions: [{ text: 'Checked output', passed: score >= 0.5 }],
+  });
+}
+
+mock.module('ai', () => {
+  const actual = require('ai');
+  return {
+    ...actual,
+    generateText: mock(async (args: Record<string, unknown>) => {
+      capturedGenerateTextArgs = args;
+      return {
+        text: graderJsonResponse(0.85),
+        usage: { inputTokens: 10, outputTokens: 20 },
+        finishReason: 'stop',
+        response: { id: 'test', timestamp: new Date(), modelId: 'test' },
+      };
+    }),
+  };
+});
+
+// Import AFTER mock is set up
+const { extractImageBlocks } = await import('../../src/evaluation/evaluators/llm-grader.js');
+const { LlmGraderEvaluator } = await import('../../src/evaluation/evaluators.js');
+
+// ---------------------------------------------------------------------------
+// Test helpers
+// ---------------------------------------------------------------------------
+
+const baseTestCase: EvalTest = {
+  id: 'mm-case-1',
+  dataset: 'test-dataset',
+  question: 'Describe the image',
+  input: [{ role: 'user', content: 'What is in this image?' }],
+  expected_output: [],
+  reference_answer: 'A cat sitting on a mat',
+  file_paths: [],
+  criteria: 'Accurately describes image content',
+  evaluator: 'llm-grader',
+};
+
+const baseTarget: ResolvedTarget = {
+  kind: 'mock',
+  name: 'mock',
+  config: { response: '{}' },
+};
+
+/**
+ * Creates a provider with a fake asLanguageModel() that returns a sentinel
+ * object. The actual model behavior is handled by the mocked generateText.
+ */ +function createLmProvider() { + const fakeModel = { modelId: 'test-model', provider: 'test' }; + return { + id: 'test-lm', + kind: 'mock' as const, + targetName: 'test-lm', + invoke: mock(async () => ({ output: [] })), + asLanguageModel: () => fakeModel as never, + }; +} + +// --------------------------------------------------------------------------- +// extractImageBlocks unit tests +// --------------------------------------------------------------------------- + +describe('extractImageBlocks', () => { + it('returns empty array when no messages', () => { + expect(extractImageBlocks([])).toEqual([]); + }); + + it('returns empty array when messages have only string content', () => { + const messages: Message[] = [ + { role: 'user', content: 'hello' }, + { role: 'assistant', content: 'world' }, + ]; + expect(extractImageBlocks(messages)).toEqual([]); + }); + + it('extracts images from assistant messages with Content[] content', () => { + const messages: Message[] = [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Here is the result' }, + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc123' }, + ], + }, + ]; + const images = extractImageBlocks(messages); + expect(images).toHaveLength(1); + expect(images[0]).toEqual({ + type: 'image', + media_type: 'image/png', + source: 'data:image/png;base64,abc123', + }); + }); + + it('extracts multiple images across multiple assistant messages', () => { + const messages: Message[] = [ + { + role: 'assistant', + content: [ + { type: 'image', media_type: 'image/png', source: 'https://example.com/img1.png' }, + ], + }, + { + role: 'assistant', + content: [ + { type: 'text', text: 'Another response' }, + { type: 'image', media_type: 'image/jpeg', source: 'data:image/jpeg;base64,xyz789' }, + { type: 'image', media_type: 'image/webp', source: 'https://example.com/img2.webp' }, + ], + }, + ]; + const images = extractImageBlocks(messages); + expect(images).toHaveLength(3); + 
expect(images[0].source).toBe('https://example.com/img1.png'); + expect(images[1].source).toBe('data:image/jpeg;base64,xyz789'); + expect(images[2].source).toBe('https://example.com/img2.webp'); + }); + + it('ignores images in non-assistant messages', () => { + const messages: Message[] = [ + { + role: 'user', + content: [ + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,user-img' }, + ], + }, + { + role: 'assistant', + content: [ + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,asst-img' }, + ], + }, + { + role: 'tool', + content: [ + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,tool-img' }, + ], + }, + ]; + const images = extractImageBlocks(messages); + expect(images).toHaveLength(1); + expect(images[0].source).toBe('data:image/png;base64,asst-img'); + }); + + it('ignores file content blocks (only extracts images)', () => { + const messages: Message[] = [ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Result' }, + { type: 'file', media_type: 'application/pdf', path: '/docs/doc.pdf' }, + { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,abc' }, + ], + }, + ]; + const images = extractImageBlocks(messages); + expect(images).toHaveLength(1); + expect(images[0].type).toBe('image'); + }); +}); + +// --------------------------------------------------------------------------- +// LLM grader multimodal integration tests +// --------------------------------------------------------------------------- + +describe('LlmGraderEvaluator multimodal', () => { + beforeEach(() => { + capturedGenerateTextArgs = undefined; + }); + + it('sends plain text prompt when output has no images', async () => { + const provider = createLmProvider(); + + const evaluator = new LlmGraderEvaluator({ + resolveGraderProvider: async () => provider, + }); + + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'A cat on a mat', + target: baseTarget, + 
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe the image' },
+      now: new Date(),
+      output: [{ role: 'assistant', content: 'A cat on a mat' }],
+    });
+
+    expect(result.score).toBe(0.85);
+    expect(capturedGenerateTextArgs).toBeDefined();
+
+    // When no images, generateText should receive `prompt` (string), not `messages`
+    expect(capturedGenerateTextArgs!.prompt).toBeTypeOf('string');
+    expect(capturedGenerateTextArgs!.messages).toBeUndefined();
+  });
+
+  it('sends multi-part messages when output contains images', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const outputMessages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Here is a cat' },
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,CATIMAGE' },
+        ],
+      },
+    ];
+
+    const result = await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Here is a cat',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe the image' },
+      now: new Date(),
+      output: outputMessages,
+    });
+
+    expect(result.score).toBe(0.85);
+    expect(capturedGenerateTextArgs).toBeDefined();
+
+    // When images exist, generateText should receive `messages` with multi-part content
+    expect(capturedGenerateTextArgs!.messages).toBeDefined();
+    expect(capturedGenerateTextArgs!.prompt).toBeUndefined();
+
+    const messages = capturedGenerateTextArgs!.messages as Array<Record<string, unknown>>;
+    expect(messages).toHaveLength(1);
+    expect(messages[0].role).toBe('user');
+
+    const content = messages[0].content as Array<Record<string, unknown>>;
+
+    // Should contain text part + image part
+    const textParts = content.filter((p) => p.type === 'text');
+    const imageParts = content.filter((p) => p.type === 'image');
+
+    expect(textParts.length).toBeGreaterThanOrEqual(1);
+    expect(imageParts).toHaveLength(1);
+
+    // Verify image data is passed through
+    expect(imageParts[0].image).toBe('data:image/png;base64,CATIMAGE');
+    expect(imageParts[0].mediaType).toBe('image/png');
+  });
+
+  it('appends multiple images from output', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const outputMessages: Message[] = [
+      {
+        role: 'assistant',
+        content: [
+          { type: 'text', text: 'Two images' },
+          { type: 'image', media_type: 'image/png', source: 'https://example.com/img1.png' },
+          { type: 'image', media_type: 'image/jpeg', source: 'data:image/jpeg;base64,IMG2DATA' },
+        ],
+      },
+    ];
+
+    await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Two images',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe the images' },
+      now: new Date(),
+      output: outputMessages,
+    });
+
+    expect(capturedGenerateTextArgs).toBeDefined();
+    const messages = capturedGenerateTextArgs!.messages as Array<Record<string, unknown>>;
+    const content = messages[0].content as Array<Record<string, unknown>>;
+
+    const imageParts = content.filter((p) => p.type === 'image');
+    expect(imageParts).toHaveLength(2);
+    expect(imageParts[0].image).toBe('https://example.com/img1.png');
+    expect(imageParts[1].image).toBe('data:image/jpeg;base64,IMG2DATA');
+  });
+
+  it('ignores images in user/tool messages (only assistant)', async () => {
+    const provider = createLmProvider();
+
+    const evaluator = new LlmGraderEvaluator({
+      resolveGraderProvider: async () => provider,
+    });
+
+    const outputMessages: Message[] = [
+      {
+        role: 'user',
+        content: [
+          { type: 'image', media_type: 'image/png', source: 'data:image/png;base64,USERIMG' },
+        ],
+      },
+      {
+        role: 'assistant',
+        content: 'Just text, no images',
+      },
+    ];
+
+    await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Just text, no images',
+      target: baseTarget,
+      provider,
+      attempt: 0,
+      promptInputs: { question: 'Describe' },
+      now: new Date(),
+      output: outputMessages,
+    });
+
expect(capturedGenerateTextArgs).toBeDefined(); + + // No images in assistant messages → should use plain prompt + expect(capturedGenerateTextArgs!.prompt).toBeTypeOf('string'); + expect(capturedGenerateTextArgs!.messages).toBeUndefined(); + }); +});