From cc34f6a88d2af1573c748dff6fe38b3e01db355f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 02:48:47 +0000 Subject: [PATCH 1/6] =?UTF-8?q?feat(eval):=20simplify=20template=20variabl?= =?UTF-8?q?es=20=E2=80=94=20drop=20=5Ftext=20suffix,=20align=20with=20indu?= =?UTF-8?q?stry=20patterns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - {{output}}, {{input}}, {{expected_output}} now resolve to human-readable text instead of JSON.stringify'd message arrays - Deprecated _text aliases ({{input_text}}, {{output_text}}, {{expected_output_text}}) still work but emit a stderr warning - Removed outputText, inputText, expectedOutputText from CodeGraderInput schema — code graders should extract text from Message.content using getTextContent() from @agentv/core - Removed EnrichedCodeGraderInput type (no longer needed) - Updated default evaluator template to use new variable names - Updated prompt-validator to accept both new and deprecated variable names Closes #825 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../evaluation/evaluators/code-evaluator.ts | 3 - .../evaluators/llm-grader-prompt.ts | 7 +- .../src/evaluation/evaluators/llm-grader.ts | 68 +++++++++++--- .../evaluators/prompt-resolution.ts | 3 - .../core/src/evaluation/template-variables.ts | 27 +++++- .../evaluation/validation/prompt-validator.ts | 28 +++++- .../core/test/evaluation/evaluators.test.ts | 4 + .../evaluation/evaluators_variables.test.ts | 72 +++++++++------ .../loaders/evaluator-parser.test.ts | 4 +- .../core/test/evaluation/orchestrator.test.ts | 16 +++- .../core/test/fixtures/test-define-grader.ts | 7 +- .../fixtures/test-grader-with-details.cjs | 5 +- packages/core/test/fixtures/test-grader.cjs | 7 +- packages/eval/src/assertion.ts | 13 +-- packages/eval/src/deprecation.ts | 24 ++--- packages/eval/src/index.ts | 77 +++++----------- packages/eval/src/prompt-template.ts | 49 +++------- packages/eval/src/runtime.ts | 12 +-- packages/eval/src/schemas.ts | 24 +---- packages/eval/test/define-code-grader.test.ts | 9 +- .../eval/test/define-prompt-template.test.ts | 19 ---- packages/eval/test/deprecation.test.ts | 91 +------------------ packages/eval/test/file-backed-output.test.ts | 6 -- 23 files changed, 247 insertions(+), 328 deletions(-) diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts index a1cecc08c..c2410924b 100644 --- a/packages/core/src/evaluation/evaluators/code-evaluator.ts +++ b/packages/core/src/evaluation/evaluators/code-evaluator.ts @@ -64,7 +64,6 @@ export class CodeEvaluator implements Evaluator { const payload = { criteria: context.evalCase.criteria, expectedOutput: context.evalCase.expected_output, - outputText: context.candidate, output: outputForPayload, outputPath, inputFiles: context.evalCase.file_paths, @@ -78,8 +77,6 @@ export class CodeEvaluator implements Evaluator { fileChanges: context.fileChanges ?? null, workspacePath: context.workspacePath ?? null, config: this.config ?? null, - inputText: context.evalCase.question, - expectedOutputText: context.evalCase.reference_answer ?? '', }; const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2); diff --git a/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts b/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts index 1a3d26bee..b8d80feff 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts @@ -68,11 +68,12 @@ function assembleFreeform( : evalCase.question; const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2), - [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '', + // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? '').trim(), diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts index abd550398..553f06774 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader.ts +++ b/packages/core/src/evaluation/evaluators/llm-grader.ts @@ -6,7 +6,7 @@ import { z } from 'zod'; import type { Provider, ProviderResponse } from '../providers/types.js'; import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; -import { TEMPLATE_VARIABLES } from '../template-variables.js'; +import { DEPRECATED_TEMPLATE_VARIABLES, TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { AssertionEntry, JsonObject, RubricItem } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; @@ -74,13 +74,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r {{${TEMPLATE_VARIABLES.CRITERIA}}} [[ ## question ## ]] -{{${TEMPLATE_VARIABLES.INPUT_TEXT}}} +{{${TEMPLATE_VARIABLES.INPUT}}} [[ ## reference_answer ## ]] -{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}} +{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}} [[ ## answer ## ]] -{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`; +{{${TEMPLATE_VARIABLES.OUTPUT}}}`; type GraderProviderResolver = (context: EvaluationContext) => Promise; @@ -206,17 +206,15 @@ export class LlmGraderEvaluator implements Evaluator { ? context.promptInputs.question : context.evalCase.question; - // Prepare template variables for substitution + // Prepare template variables for substitution. + // Primary variables resolve to human-readable text; deprecated _text aliases map to the same values. const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify( - context.evalCase.expected_output, - null, - 2, - ), - [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + // Deprecated aliases — same values as the primary variables above [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), @@ -228,6 +226,10 @@ export class LlmGraderEvaluator implements Evaluator { // Build user prompt based on custom template or default template const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE; + + // Warn once per run when custom templates use deprecated _text variable names + warnDeprecatedTemplateVars(evaluatorTemplate); + let userPrompt = substituteVariables(evaluatorTemplate, variables); // Append file_changes section to default template only when present @@ -615,13 +617,18 @@ export class LlmGraderEvaluator implements Evaluator { const variables: Record = { [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', }; if (this.evaluatorTemplate) { + warnDeprecatedTemplateVars(this.evaluatorTemplate); return substituteVariables(this.evaluatorTemplate, variables); } @@ -685,11 +692,16 @@ export class LlmGraderEvaluator implements Evaluator { if (this.evaluatorTemplate) { const variables: Record = { [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', }; + warnDeprecatedTemplateVars(this.evaluatorTemplate); const customPrompt = substituteVariables(this.evaluatorTemplate, variables); const outputSchema = @@ -1018,6 +1030,34 @@ export function substituteVariables(template: string, variables: Record(); + +/** + * Emit a one-time stderr warning when a template uses deprecated _text variable names. + * Skips the default template (which uses the new names and should never trigger warnings). + */ +export function warnDeprecatedTemplateVars(template: string): void { + if (warnedTemplateStrings.has(template)) return; + + const used: string[] = []; + for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) { + if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) { + used.push(`{{ ${deprecated} }} → {{ ${replacement} }}`); + } + } + + if (used.length > 0) { + warnedTemplateStrings.add(template); + console.warn( + `${ANSI_YELLOW}⚠ Deprecated template variables detected (they still work but will be removed in a future version):\n ${used.join('\n ')}\n Update your custom evaluator template to use the new names.${ANSI_RESET}`, + ); + } +} + export function calculateRubricScore( result: z.infer, rubrics: readonly RubricItem[], diff --git a/packages/core/src/evaluation/evaluators/prompt-resolution.ts b/packages/core/src/evaluation/evaluators/prompt-resolution.ts index 7c20387d5..5429e62ab 100644 --- a/packages/core/src/evaluation/evaluators/prompt-resolution.ts +++ b/packages/core/src/evaluation/evaluators/prompt-resolution.ts @@ -75,7 +75,6 @@ async function executePromptTemplate( const payload = { criteria: context.evalCase.criteria, expectedOutput: context.evalCase.expected_output, - outputText: context.candidate, output: context.output ?? null, inputFiles: context.evalCase.file_paths, input: context.evalCase.input, @@ -83,8 +82,6 @@ async function executePromptTemplate( fileChanges: context.fileChanges ?? null, workspacePath: context.workspacePath ?? null, config: config ?? context.config ?? null, - inputText: context.evalCase.question, - expectedOutputText: context.evalCase.reference_answer ?? '', }; const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index a429a2c11..31d289145 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -1,6 +1,18 @@ /** * Template variable constants for evaluator prompts. * These variables can be used in custom evaluator templates with {{ variable_name }} syntax. + * + * Primary variables: + * - {{ input }} — input as plain text (single-turn) or role-prefixed conversation (multi-turn) + * - {{ output }} — last assistant message as plain text + * - {{ expected_output }} — reference answer as plain text + * - {{ criteria }} — evaluation criteria string + * - {{ file_changes }} — file diff (if available) + * + * Deprecated aliases (emit a warning when used in custom templates): + * - {{ input_text }} → use {{ input }} + * - {{ output_text }} → use {{ output }} + * - {{ expected_output_text }} → use {{ expected_output }} */ export const TEMPLATE_VARIABLES = { EXPECTED_OUTPUT: 'expected_output', @@ -8,8 +20,11 @@ export const TEMPLATE_VARIABLES = { INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', + /** @deprecated Use INPUT instead — resolves to the same text value. */ INPUT_TEXT: 'input_text', + /** @deprecated Use OUTPUT instead — resolves to the same text value. */ OUTPUT_TEXT: 'output_text', + /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */ EXPECTED_OUTPUT_TEXT: 'expected_output_text', } as const; @@ -28,6 +43,16 @@ export const VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_V * At least one of these should be present in a custom evaluator template. */ export const REQUIRED_TEMPLATE_VARIABLES = new Set([ - TEMPLATE_VARIABLES.OUTPUT_TEXT, + TEMPLATE_VARIABLES.OUTPUT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT, ]); + +/** + * Deprecated template variable names that still work but trigger a warning. + * Maps deprecated name → replacement name. + */ +export const DEPRECATED_TEMPLATE_VARIABLES: ReadonlyMap = new Map([ + [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT], + [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT], + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT], +]); diff --git a/packages/core/src/evaluation/validation/prompt-validator.ts b/packages/core/src/evaluation/validation/prompt-validator.ts index 9f7ccf914..d3d141388 100644 --- a/packages/core/src/evaluation/validation/prompt-validator.ts +++ b/packages/core/src/evaluation/validation/prompt-validator.ts @@ -1,6 +1,10 @@ import { readFile } from 'node:fs/promises'; -import { TEMPLATE_VARIABLES, VALID_TEMPLATE_VARIABLES } from '../template-variables.js'; +import { + DEPRECATED_TEMPLATE_VARIABLES, + TEMPLATE_VARIABLES, + VALID_TEMPLATE_VARIABLES, +} from '../template-variables.js'; const ANSI_YELLOW = '\u001b[33m'; const ANSI_RESET = '\u001b[0m'; @@ -36,15 +40,31 @@ export function validateTemplateVariables(content: string, source: string): void match = variablePattern.exec(content); } - // Check if template contains required variables for evaluation - const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT); + // Check if template contains required variables for evaluation. + // Accept both new names (output, expected_output) and deprecated aliases (output_text, expected_output_text). + const hasCandidateAnswer = + foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || + foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT); const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT); const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput; // ERROR: Missing required fields - throw error to skip this evaluator/eval case if (!hasRequiredFields) { throw new Error( - `Missing required fields. Must include at least one of:\n - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}\n - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`, + `Missing required fields. Must include at least one of:\n - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}\n - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`, + ); + } + + // WARNING: Deprecated variables - show warning but continue + const deprecatedUsed: string[] = []; + for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) { + if (foundVariables.has(deprecated)) { + deprecatedUsed.push(`{{ ${deprecated} }} → {{ ${replacement} }}`); + } + } + if (deprecatedUsed.length > 0) { + console.warn( + `${ANSI_YELLOW}Warning: Template at ${source} uses deprecated variable names:\n ${deprecatedUsed.join('\n ')}\n These still work but will be removed in a future version.${ANSI_RESET}`, ); } diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index df7ef94b8..a7b92afa4 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -715,6 +715,7 @@ describe('CodeEvaluator', () => { const result = await evaluator.evaluate({ evalCase: evalCaseWithExpectedMessages, candidate: expectedCandidate, + output: [{ role: 'assistant', content: '{"decision":"ACCEPT"}' }], target: baseTarget, provider: graderProvider, attempt: 0, @@ -765,6 +766,7 @@ describe('CodeEvaluator', () => { const result = await evaluator.evaluate({ evalCase: baseTestCase, candidate: 'Added logging to the implementation', + output: [{ role: 'assistant', content: 'Added logging to the implementation' }], target: baseTarget, provider: graderProvider, attempt: 0, @@ -791,6 +793,7 @@ describe('CodeEvaluator', () => { expected_output: [{ role: 'assistant', content: 'test' }], }, candidate: 'Test candidate', + output: [{ role: 'assistant', content: 'Test candidate' }], target: baseTarget, provider: graderProvider, attempt: 0, @@ -848,6 +851,7 @@ describe('CodeEvaluator', () => { expected_output: [{ role: 'assistant', content: { decision: 'ACCEPT' } }], }, candidate: '{"decision":"ACCEPT"}', + output: [{ role: 'assistant', content: '{"decision":"ACCEPT"}' }], target: baseTarget, provider: graderProvider, attempt: 0, diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index 6b47e2941..5eeda30de 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -45,12 +45,10 @@ describe('LlmGraderEvaluator Variable Substitution', () => { it('substitutes template variables in custom prompt', async () => { const formattedQuestion = '@[User]: What is the status?\n\n@[Assistant]: Requesting more info.'; const customPrompt = ` -Question: {{input_text}} +Question: {{input}} Outcome: {{criteria}} -Reference: {{expected_output_text}} -Candidate: {{output_text}} -Input Messages: {{input}} -Expected Messages: {{expected_output}} +Reference: {{expected_output}} +Candidate: {{output}} File Changes: {{file_changes}} `; @@ -82,22 +80,13 @@ File Changes: {{file_changes}} const request = graderProvider.lastRequest; expect(request).toBeDefined(); - // When custom evaluatorTemplate is provided, it goes in the user prompt (question) - // System prompt only contains the output schema + // Primary variables resolve to human-readable text expect(request?.question).toContain(`Question: ${formattedQuestion}`); expect(request?.question).not.toContain('Original Question Text'); expect(request?.question).toContain('Outcome: Expected Outcome Text'); expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify input JSON stringification - expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); - - // Verify expected_output JSON stringification - expect(request?.question).toContain('Expected Messages: ['); - expect(request?.question).toContain('"value": "Expected Output Message"'); - // Verify file_changes substitution expect(request?.question).toContain('File Changes: diff --git a/test.txt b/test.txt'); expect(request?.question).toContain('+added line'); @@ -107,6 +96,45 @@ File Changes: {{file_changes}} expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`); }); + it('deprecated _text aliases still resolve correctly', async () => { + const formattedQuestion = 'What is 2+2?'; + const customPrompt = ` +Question: {{input_text}} +Reference: {{expected_output_text}} +Candidate: {{output_text}} +`; + + const graderProvider = new CapturingProvider({ + text: JSON.stringify({ + score: 0.9, + assertions: [{ text: 'OK', passed: true }], + }), + }); + + const evaluator = new LlmGraderEvaluator({ + resolveGraderProvider: async () => graderProvider, + evaluatorTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: { ...baseTestCase, evaluator: 'llm-grader' }, + candidate: 'Four', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: formattedQuestion }, + now: new Date(), + }); + + const request = graderProvider.lastRequest; + expect(request).toBeDefined(); + + // Deprecated aliases resolve to the same text values as the primary variables + expect(request?.question).toContain(`Question: ${formattedQuestion}`); + expect(request?.question).toContain('Reference: Reference Answer Text'); + expect(request?.question).toContain('Candidate: Four'); + }); + it('does not substitute if no variables are present', async () => { const customPrompt = 'Fixed prompt without variables'; const promptQuestion = 'Summarize the latest logs without markers.'; @@ -143,12 +171,10 @@ File Changes: {{file_changes}} it('substitutes template variables with whitespace inside braces', async () => { const formattedQuestion = 'What is the status?'; const customPrompt = ` -Question: {{ input_text }} +Question: {{ input }} Outcome: {{ criteria }} -Reference: {{ expected_output_text }} -Candidate: {{ output_text }} -Input Messages: {{ input }} -Expected Messages: {{ expected_output }} +Reference: {{ expected_output }} +Candidate: {{ output }} `; const graderProvider = new CapturingProvider({ @@ -184,12 +210,6 @@ Expected Messages: {{ expected_output }} expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify JSON stringified variables were also substituted - expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); - expect(request?.question).toContain('Expected Messages: ['); - expect(request?.question).toContain('"value": "Expected Output Message"'); - // Verify no unreplaced template markers remain expect(request?.question).not.toMatch(/\{\{\s*\w+\s*\}\}/); }); diff --git a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts index 0e76f15d3..0c56c86a5 100644 --- a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts +++ b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts @@ -1614,8 +1614,8 @@ describe('parseEvaluators - composite assertions field', () => { tempDir = path.join(os.tmpdir(), `agentv-test-composite-assert-${Date.now()}`); await mkdir(tempDir, { recursive: true }); // Create dummy prompt files for llm-grader members (must include required template fields) - await writeFile(path.join(tempDir, 'safety.md'), 'Evaluate safety of {{ output_text }}'); - await writeFile(path.join(tempDir, 'quality.md'), 'Evaluate quality of {{ output_text }}'); + await writeFile(path.join(tempDir, 'safety.md'), 'Evaluate safety of {{ output }}'); + await writeFile(path.join(tempDir, 'quality.md'), 'Evaluate quality of {{ output }}'); }); afterAll(async () => { diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index fed09e1db..fb46f8fe9 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -975,9 +975,12 @@ describe('runEvalCase trace integration', () => { `import { readFileSync } from 'fs'; const stdin = readFileSync(0, 'utf8'); const input = JSON.parse(stdin); -console.log(\`Question: \${input.input_text} -Answer: \${input.output_text} -Reference: \${input.expected_output_text ?? 'none'}\`); +const question = (input.input || []).map((m) => String(m.content ?? '')).join('\\n'); +const answer = (input.output || []).map((m) => String(m.content ?? '')).join('\\n'); +const ref = (input.expected_output || []).map((m) => String(m.content ?? '')).join('\\n') || 'none'; +console.log(\`Question: \${question} +Answer: \${answer} +Reference: \${ref}\`); `, ); @@ -1009,7 +1012,9 @@ Reference: \${input.expected_output_text ?? 'none'}\`); evalCase: { ...baseTestCase, question: 'What is 2+2?', + input: [{ role: 'user', content: 'What is 2+2?' }], reference_answer: 'The sum is 4', + expected_output: [{ role: 'assistant', content: 'The sum is 4' }], assertions: [ { name: 'ts-prompt-eval', @@ -1040,7 +1045,9 @@ Reference: \${input.expected_output_text ?? 'none'}\`); `const fs = require('fs'); const stdin = fs.readFileSync(0, 'utf8'); const input = JSON.parse(stdin); -console.log('Question: ' + input.input_text + '\\nAnswer: ' + input.output_text); +const question = (input.input || []).map((m) => String(m.content || '')).join('\\n'); +const answer = (input.output || []).map((m) => String(m.content || '')).join('\\n'); +console.log('Question: ' + question + '\\nAnswer: ' + answer); `, ); @@ -1070,6 +1077,7 @@ console.log('Question: ' + input.input_text + '\\nAnswer: ' + input.output_text) evalCase: { ...baseTestCase, question: 'Test question', + input: [{ role: 'user', content: 'Test question' }], assertions: [ { name: 'js-prompt-eval', diff --git a/packages/core/test/fixtures/test-define-grader.ts b/packages/core/test/fixtures/test-define-grader.ts index 820d48bc1..f5c41f75d 100644 --- a/packages/core/test/fixtures/test-define-grader.ts +++ b/packages/core/test/fixtures/test-define-grader.ts @@ -4,12 +4,15 @@ */ import { defineCodeGrader } from '../../../eval/src/index.js'; -export default defineCodeGrader(({ outputText, criteria }) => { +export default defineCodeGrader(({ output, criteria }) => { const assertions: { text: string; passed: boolean }[] = []; + // Extract text from the output message array + const candidateText = (output ?? []).map((m) => String(m.content ?? '')).join(' '); + // Simple check: does candidate mention the criteria keywords? const outcomeWords = criteria.toLowerCase().split(/\s+/); - const candidateWords = outputText.toLowerCase().split(/\s+/); + const candidateWords = candidateText.toLowerCase().split(/\s+/); for (const word of outcomeWords) { if (word.length > 3 && candidateWords.includes(word)) { diff --git a/packages/core/test/fixtures/test-grader-with-details.cjs b/packages/core/test/fixtures/test-grader-with-details.cjs index e3ce45923..b11c34d36 100644 --- a/packages/core/test/fixtures/test-grader-with-details.cjs +++ b/packages/core/test/fixtures/test-grader-with-details.cjs @@ -7,7 +7,10 @@ const fs = require('node:fs'); const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); -const hasCandidate = typeof input.output_text === 'string'; +// Extract candidate text from the output message array +const outputMessages = Array.isArray(input.output) ? input.output : []; +const candidateText = outputMessages.map((m) => String(m.content ?? '')).join(''); +const hasCandidate = candidateText.length > 0; // Emit details with structured metrics console.log( diff --git a/packages/core/test/fixtures/test-grader.cjs b/packages/core/test/fixtures/test-grader.cjs index 4b049b1c2..a8957844c 100644 --- a/packages/core/test/fixtures/test-grader.cjs +++ b/packages/core/test/fixtures/test-grader.cjs @@ -4,11 +4,14 @@ const fs = require('node:fs'); const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); -const hasCandidate = typeof input.output_text === 'string'; +// Extract candidate text from the output message array +const outputMessages = Array.isArray(input.output) ? input.output : []; +const candidateText = outputMessages.map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content))).join(''); +const hasCandidate = candidateText.length > 0; let candidateDecisionOk = false; try { - const obj = JSON.parse(input.output_text); + const obj = JSON.parse(candidateText); candidateDecisionOk = obj && obj.decision === 'ACCEPT'; } catch {} diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index e69e9e625..1d654f329 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -14,17 +14,12 @@ import { CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, - type EnrichedCodeGraderInput, } from './schemas.js'; /** * Context provided to assertion handlers. - * - * Same shape as CodeGraderInput but with `inputText`, `outputText`, and - * `expectedOutputText` guaranteed to be strings (populated by the runtime - * before the handler is called). */ -export type AssertionContext = EnrichedCodeGraderInput; +export type AssertionContext = CodeGraderInput; /** * Known built-in assertion types. Custom types are extensible via string. @@ -193,11 +188,11 @@ export async function runAssertion(handler: AssertionHandler): Promise { }); } - // Enrich input with text accessors and deprecation warnings + // Enrich input — no-op pass-through enrichInput(input); - // After enrichment, text accessors are guaranteed to be strings - const rawResult = await handler(input as EnrichedCodeGraderInput); + // Run handler + const rawResult = await handler(input); const normalized = normalizeScore(rawResult); const result = CodeGraderResultSchema.parse(normalized); console.log(JSON.stringify(result, null, 2)); diff --git a/packages/eval/src/deprecation.ts b/packages/eval/src/deprecation.ts index 735cdc508..35d80939f 100644 --- a/packages/eval/src/deprecation.ts +++ b/packages/eval/src/deprecation.ts @@ -1,26 +1,20 @@ /** * Input enrichment utilities for code grader and assertion runtimes. - * Populates text convenience accessors on validated input objects. + * + * With the removal of text convenience accessors (`inputText`, `outputText`, + * `expectedOutputText`) from CodeGraderInput, this module is a no-op pass-through. + * Kept for backward compatibility — existing runtimes call `enrichInput()` and + * the call is harmless. */ import type { CodeGraderInput } from './schemas.js'; /** - * Populate `inputText`, `outputText`, and `expectedOutputText` accessors - * on the validated input object. + * Enrich a validated CodeGraderInput. * - * Text accessors are always strings. Structured fields (`input`, `output`, `expectedOutput`) - * remain `Message[]` always. + * Previously populated text convenience accessors; now a no-op pass-through since + * those fields were removed. Code graders should extract text from `Message.content` + * using `getTextContent()` from `@agentv/core` instead. */ export function enrichInput(input: CodeGraderInput): CodeGraderInput { - // Ensure expectedOutputText is always a string (may be undefined from schema) - if (input.expectedOutputText === undefined) { - Object.defineProperty(input, 'expectedOutputText', { - value: '', - writable: false, - configurable: true, - enumerable: true, - }); - } - return input; } diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 49c740167..c814b698d 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -8,9 +8,12 @@ * #!/usr/bin/env bun * import { defineAssertion } from '@agentv/eval'; * - * export default defineAssertion(({ outputText }) => ({ - * pass: outputText.includes('hello'), - * assertions: [{ text: 'Checks greeting', passed: outputText.includes('hello') }], + * export default defineAssertion(({ output, criteria }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * return { + * pass: text.includes('hello'), + * assertions: [{ text: 'Checks greeting', passed: text.includes('hello') }], + * }; * })); * ``` * @@ -19,33 +22,15 @@ * #!/usr/bin/env bun * import { defineCodeGrader } from '@agentv/eval'; * - * export default defineCodeGrader(({ trace, outputText }) => ({ - * score: trace?.eventCount <= 5 ? 1.0 : 0.5, - * assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }], + * export default defineCodeGrader(({ trace, output }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * return { + * score: trace?.eventCount <= 5 ? 1.0 : 0.5, + * assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }], + * }; * })); * ``` * - * @example Code grader with target access (requires `target` config in YAML) - * ```typescript - * #!/usr/bin/env bun - * import { defineCodeGrader, createTargetClient } from '@agentv/eval'; - * - * export default defineCodeGrader(async ({ inputText }) => { - * const target = createTargetClient(); - * if (!target) { - * return { score: 0, assertions: [{ text: 'Target not available', passed: false }] }; - * } - * - * const response = await target.invoke({ - * question: `Evaluate: ${inputText}`, - * systemPrompt: 'Respond with JSON: { "score": 0-1 }' - * }); - * - * const result = JSON.parse(response.rawText ?? '{}'); - * return { score: result.score ?? 0 }; - * }); - * ``` - * * @packageDocumentation */ @@ -60,7 +45,6 @@ export { PromptTemplateInputSchema, type CodeGraderInput, type CodeGraderResult, - type EnrichedCodeGraderInput, type TraceSummary, type Message, type ToolCall, @@ -161,25 +145,10 @@ export function defineCodeGrader(handler: CodeGraderHandler): void { * ```typescript * import { definePromptTemplate } from '@agentv/eval'; * - * export default definePromptTemplate((ctx) => ` - * Question: ${ctx.inputText} - * Answer: ${ctx.outputText} - * - * ${ctx.expectedOutputText ? `Reference: ${ctx.expectedOutputText}` : ''} - * `); - * ``` - * - * @example With conditional logic - * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; - * * export default definePromptTemplate((ctx) => { - * const rubric = ctx.config?.rubric as string | undefined; - * return ` - * Question: ${ctx.inputText} - * Candidate Answer: ${ctx.outputText} - * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} - * `; + * const question = ctx.input.map(m => String(m.content ?? '')).join('\n'); + * const answer = ctx.output?.map(m => String(m.content ?? '')).join('\n') ?? ''; + * return `Question: ${question}\nAnswer: ${answer}`; * }); * ``` */ @@ -209,9 +178,12 @@ export function definePromptTemplate(handler: PromptTemplateHandler): void { * ```typescript * import { defineAssertion } from '@agentv/eval'; * - * export default defineAssertion(({ outputText }) => ({ - * pass: outputText.toLowerCase().includes('hello'), - * assertions: [{ text: 'Checks for greeting', passed: outputText.toLowerCase().includes('hello') }], + * export default defineAssertion(({ output }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * return { + * pass: text.toLowerCase().includes('hello'), + * assertions: [{ text: 'Checks for greeting', passed: text.toLowerCase().includes('hello') }], + * }; * })); * ``` * @@ -219,8 +191,9 @@ export function definePromptTemplate(handler: PromptTemplateHandler): void { * ```typescript * import { defineAssertion } from '@agentv/eval'; * - * export default defineAssertion(({ outputText, trace }) => { - * const hasContent = outputText.length > 0 ? 0.5 : 0; + * export default defineAssertion(({ output, trace }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * const hasContent = text.length > 0 ? 0.5 : 0; * const isEfficient = (trace?.eventCount ?? 0) <= 5 ? 0.5 : 0; * return { * score: hasContent + isEfficient, @@ -229,7 +202,7 @@ export function definePromptTemplate(handler: PromptTemplateHandler): void { * { text: 'Efficient', passed: !!isEfficient }, * ], * }; - * }); + * })); * ``` */ export function defineAssertion(handler: AssertionHandler): void { diff --git a/packages/eval/src/prompt-template.ts b/packages/eval/src/prompt-template.ts index 09e7f8e5b..c3669e5e6 100644 --- a/packages/eval/src/prompt-template.ts +++ b/packages/eval/src/prompt-template.ts @@ -6,16 +6,13 @@ import { readFileSync } from 'node:fs'; import { toCamelCaseDeep } from './case-conversion.js'; import { enrichInput } from './deprecation.js'; -import { type EnrichedCodeGraderInput, PromptTemplateInputSchema } from './schemas.js'; +import { type CodeGraderInput, PromptTemplateInputSchema } from './schemas.js'; /** * Handler function type for prompt templates. * Returns the prompt string to use for evaluation. - * - * The input is enriched at runtime: `inputText`, `outputText`, and - * `expectedOutputText` are always populated before the handler is called. */ -export type PromptTemplateHandler = (input: EnrichedCodeGraderInput) => string | Promise; +export type PromptTemplateHandler = (input: CodeGraderInput) => string | Promise; /** * Read stdin synchronously (works in both Node.js and Bun). @@ -42,11 +39,11 @@ export async function runPromptTemplate(handler: PromptTemplateHandler): Promise // 4. Validate input with Zod const input = PromptTemplateInputSchema.parse(camelInput); - // 5. Enrich input with text accessors and deprecation warnings + // 5. Enrich input — no-op pass-through enrichInput(input); - // 6. Run handler (input is now enriched with guaranteed text accessors) - const prompt = await handler(input as EnrichedCodeGraderInput); + // 6. Run handler + const prompt = await handler(input); // 6. Output raw string (not JSON) - the prompt itself console.log(prompt); @@ -71,37 +68,13 @@ export async function runPromptTemplate(handler: PromptTemplateHandler): Promise * * @example * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; - * - * export default definePromptTemplate((ctx) => ` - * Question: ${ctx.inputText} - * Answer: ${ctx.outputText} - * - * ${ctx.expectedOutputText ? `Reference: ${ctx.expectedOutputText}` : ''} - * `); - * ``` - * - * @example With conditional logic - * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; - * - * export default definePromptTemplate((ctx) => { - * const rubric = ctx.config?.rubric as string | undefined; - * return ` - * Question: ${ctx.inputText} - * Candidate Answer: ${ctx.outputText} - * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} - * `; - * }); - * ``` - * - * @example Async handler - * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; + * import { definePromptTemplate, type CodeGraderInput } from '@agentv/eval'; + * import { getTextContent } from '@agentv/core'; * - * export default definePromptTemplate(async (ctx) => { - * // Async operations are supported - * return `Question: ${ctx.inputText}\nAnswer: ${ctx.outputText}`; + * export default definePromptTemplate((ctx: CodeGraderInput) => { + * const question = ctx.input.map(m => getTextContent(m.content)).join('\n'); + * const answer = ctx.output?.map(m => getTextContent(m.content)).join('\n') ?? ''; + * return `Question: ${question}\nAnswer: ${answer}`; * }); * ``` */ diff --git a/packages/eval/src/runtime.ts b/packages/eval/src/runtime.ts index 2363cd3b2..42099dce6 100644 --- a/packages/eval/src/runtime.ts +++ b/packages/eval/src/runtime.ts @@ -11,17 +11,13 @@ import { CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, - type EnrichedCodeGraderInput, } from './schemas.js'; /** * Handler function type for code graders. - * - * The input is enriched at runtime: `inputText`, `outputText`, and - * `expectedOutputText` are always populated before the handler is called. */ export type CodeGraderHandler = ( - input: EnrichedCodeGraderInput, + input: CodeGraderInput, ) => CodeGraderResult | Promise; /** @@ -85,11 +81,11 @@ export async function runCodeGrader(handler: CodeGraderHandler): Promise { }); } - // 6. Enrich input with text accessors and deprecation warnings + // 6. Enrich input — no-op pass-through enrichInput(input); - // 7. Run handler (input is now enriched with guaranteed text accessors) - const rawResult = await handler(input as EnrichedCodeGraderInput); + // 7. Run handler + const rawResult = await handler(input); // 8. Validate and normalize output const result = CodeGraderResultSchema.parse({ diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 43b541bbc..3385ac5dd 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -54,14 +54,12 @@ export const MessageSchema = z.object({ /** * Code grader input schema (camelCase, converted from snake_case wire format). * - * Text convenience accessors (`inputText`, `outputText`, `expectedOutputText`) are always - * strings. Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`. + * Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`. + * To extract plain text from message content, use `getTextContent()` from `@agentv/core`. */ export const CodeGraderInputSchema = z.object({ criteria: z.string(), expectedOutput: z.array(MessageSchema), - /** Last assistant message content as string. */ - outputText: z.string(), output: z.array(MessageSchema).nullable().optional(), /** Path to a temp file containing the output JSON (used for large payloads). */ outputPath: z.string().optional(), @@ -76,10 +74,6 @@ export const CodeGraderInputSchema = z.object({ fileChanges: z.string().nullable().optional(), workspacePath: z.string().nullable().optional(), config: z.record(z.unknown()).nullable().optional(), - /** All input messages as plain text. Single message: content only. Multiple: @role prefixed. */ - inputText: z.string(), - /** Last expected output message content as plain text. */ - expectedOutputText: z.string().optional(), }); /** @@ -107,20 +101,6 @@ export const CodeGraderResultSchema = z.object({ export type CodeGraderInput = z.infer; export type CodeGraderResult = z.infer; -/** - * CodeGraderInput after `enrichInput()` has run. - * - * The text accessors (`inputText`, `outputText`, `expectedOutputText`) - * are always populated by the runtime before the handler is called, so they are - * guaranteed to be `string` (never `undefined`). - * - * Handler function signatures (`CodeGraderHandler`, `AssertionHandler`) use this - * type so that user code can destructure `{ outputText }` without null-checks. - */ -export type EnrichedCodeGraderInput = Omit & { - /** Expected output content as string. */ - readonly expectedOutputText: string; -}; export type TraceSummary = z.infer; export type Message = z.infer; export type ToolCall = z.infer; diff --git a/packages/eval/test/define-code-grader.test.ts b/packages/eval/test/define-code-grader.test.ts index 6fcfb8014..67a77e878 100644 --- a/packages/eval/test/define-code-grader.test.ts +++ b/packages/eval/test/define-code-grader.test.ts @@ -11,18 +11,15 @@ import { describe('CodeGraderInputSchema', () => { const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' }], }; it('parses valid input', () => { const result = CodeGraderInputSchema.parse(validInput); - expect(result.inputText).toBe('What is 2+2?'); - expect(result.outputText).toBe('The answer is 4'); + expect(result.criteria).toBe('The answer should be 4'); }); it('accepts optional trace', () => { @@ -173,15 +170,13 @@ describe('CodeGraderResultSchema', () => { describe('CodeJudgeInputSchema (backward-compat alias)', () => { it('parses valid input via deprecated alias', () => { const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' }], }; const result = CodeJudgeInputSchema.parse(validInput); - expect(result.inputText).toBe('What is 2+2?'); + expect(result.criteria).toBe('The answer should be 4'); }); }); diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts index 9e335fbd0..890b80201 100644 --- a/packages/eval/test/define-prompt-template.test.ts +++ b/packages/eval/test/define-prompt-template.test.ts @@ -5,18 +5,14 @@ import { PromptTemplateInputSchema } from '../src/schemas.js'; describe('PromptTemplateInputSchema', () => { // Minimal valid input with all required fields const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [], - outputText: 'The answer is 4', inputFiles: [], input: [], }; it('parses valid input with all required fields', () => { const result = PromptTemplateInputSchema.parse(validInput); - expect(result.inputText).toBe('What is 2+2?'); - expect(result.outputText).toBe('The answer is 4'); expect(result.criteria).toBe('The answer should be 4'); expect(result.expectedOutput).toEqual([]); expect(result.inputFiles).toEqual([]); @@ -30,15 +26,6 @@ describe('PromptTemplateInputSchema', () => { expect(() => PromptTemplateInputSchema.parse(minimalInput)).toThrow(); }); - it('accepts optional expectedOutputText', () => { - const inputWithReference = { - ...validInput, - expectedOutputText: 'The sum of 2 and 2 is 4', - }; - const result = PromptTemplateInputSchema.parse(inputWithReference); - expect(result.expectedOutputText).toBe('The sum of 2 and 2 is 4'); - }); - it('accepts optional trace', () => { const inputWithTrace = { ...validInput, @@ -115,11 +102,8 @@ describe('PromptTemplateInputSchema', () => { it('accepts full input with all fields', () => { const fullInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - expectedOutputText: 'The sum is 4', - outputText: 'The answer is 4', output: [{ role: 'assistant', content: 'The answer is 4' }], inputFiles: ['/path/to/input.txt'], input: [{ role: 'user', content: 'What is 2+2?' }], @@ -131,10 +115,7 @@ describe('PromptTemplateInputSchema', () => { config: { rubric: 'Check correctness' }, }; const result = PromptTemplateInputSchema.parse(fullInput); - expect(result.inputText).toBe('What is 2+2?'); expect(result.criteria).toBe('The answer should be 4'); - expect(result.expectedOutputText).toBe('The sum is 4'); - expect(result.outputText).toBe('The answer is 4'); expect(result.config).toEqual({ rubric: 'Check correctness' }); }); }); diff --git a/packages/eval/test/deprecation.test.ts b/packages/eval/test/deprecation.test.ts index 7bfd5ac62..e025fd973 100644 --- a/packages/eval/test/deprecation.test.ts +++ b/packages/eval/test/deprecation.test.ts @@ -10,45 +10,17 @@ function buildInput(overrides?: Record) { return CodeGraderInputSchema.parse({ criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' }], - inputText: 'What is 2+2?', ...overrides, }); } -describe('enrichInput — text accessors', () => { - it('preserves inputText value', () => { - const input = buildInput({ inputText: 'Hello world' }); - enrichInput(input); - expect(input.inputText).toBe('Hello world'); - }); - - it('preserves outputText value', () => { - const input = buildInput({ outputText: 'The result is 42' }); - enrichInput(input); - expect(input.outputText).toBe('The result is 42'); - }); - - it('populates expectedOutputText from schema value', () => { - const input = buildInput({ expectedOutputText: 'Expected text' }); - enrichInput(input); - expect(input.expectedOutputText).toBe('Expected text'); - }); - - it('populates expectedOutputText as empty string when undefined', () => { - const input = buildInput({ expectedOutputText: undefined }); - enrichInput(input); - expect(input.expectedOutputText).toBe(''); - }); - - it('text accessors are always strings', () => { +describe('enrichInput — pass-through', () => { + it('returns the same object unchanged', () => { const input = buildInput(); - enrichInput(input); - expect(typeof input.inputText).toBe('string'); - expect(typeof input.outputText).toBe('string'); - expect(typeof input.expectedOutputText).toBe('string'); + const result = enrichInput(input); + expect(result).toBe(input); }); it('structured fields (input, output, expectedOutput) remain Message[]', () => { @@ -63,58 +35,3 @@ describe('enrichInput — text accessors', () => { expect(Array.isArray(input.expectedOutput)).toBe(true); }); }); - -describe('CodeGraderInputSchema — fields', () => { - it('accepts inputText, outputText, expectedOutputText in schema', () => { - const input = CodeGraderInputSchema.parse({ - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' }], - inputText: 'What is 2+2?', - outputText: 'The answer is 4', - expectedOutputText: 'The answer is 4', - }); - expect(input.inputText).toBe('What is 2+2?'); - expect(input.outputText).toBe('The answer is 4'); - expect(input.expectedOutputText).toBe('The answer is 4'); - }); - - it('inputText is required in schema', () => { - expect(() => - CodeGraderInputSchema.parse({ - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' }], - }), - ).toThrow(); - }); - - it('expectedOutputText is optional in schema', () => { - const input = CodeGraderInputSchema.parse({ - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' }], - inputText: 'What is 2+2?', - }); - expect(input.expectedOutputText).toBeUndefined(); - }); - - it('does not accept deprecated question field', () => { - expect(() => - CodeGraderInputSchema.parse({ - question: 'What is 2+2?', - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' }], - inputText: 'What is 2+2?', - }), - ).not.toThrow(); // extra fields are stripped by zod by default - }); -}); diff --git a/packages/eval/test/file-backed-output.test.ts b/packages/eval/test/file-backed-output.test.ts index 3b569a50b..58e931f3e 100644 --- a/packages/eval/test/file-backed-output.test.ts +++ b/packages/eval/test/file-backed-output.test.ts @@ -7,10 +7,8 @@ import { type CodeGraderInput, CodeGraderInputSchema } from '../src/schemas.js'; describe('CodeGraderInputSchema with outputPath', () => { const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' }], }; @@ -58,10 +56,8 @@ describe('Lazy file-backed output loading', () => { writeFileSync(filePath, JSON.stringify(messages)); const input: CodeGraderInput = CodeGraderInputSchema.parse({ - inputText: 'test', criteria: 'test', expectedOutput: [], - outputText: 'test', output: null, outputPath: filePath, inputFiles: [], @@ -93,10 +89,8 @@ describe('Lazy file-backed output loading', () => { it('uses inline output when outputPath is absent', () => { const input: CodeGraderInput = CodeGraderInputSchema.parse({ - inputText: 'test', criteria: 'test', expectedOutput: [], - outputText: 'test', output: [{ role: 'assistant', content: 'inline' }], inputFiles: [], input: [], From 39a8267cc6ead9bb1affd0bc5fb8c1e3f3e2ec92 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 03:24:39 +0000 Subject: [PATCH 2/6] chore(examples): update templates to use canonical variable names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace deprecated _text suffix template variables with their canonical equivalents: output_text→output, input_text→input, expected_output_text→expected_output. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/features/basic/evals/code-correctness-grader.md | 6 +++--- examples/features/composite/prompts/accuracy-check.md | 2 +- examples/features/composite/prompts/clarity-check.md | 2 +- examples/features/composite/prompts/conciseness-check.md | 2 +- examples/features/composite/prompts/detail-check.md | 2 +- examples/features/composite/prompts/quality-evaluation.md | 2 +- examples/features/composite/prompts/safety-check-strict.md | 2 +- examples/features/composite/prompts/safety-check.md | 2 +- examples/features/composite/prompts/safety-verification.md | 2 +- examples/features/composite/prompts/technical-accuracy.md | 2 +- .../multi-turn-conversation/graders/context-retention.md | 2 +- .../graders/conversation-relevancy.md | 2 +- .../multi-turn-conversation/graders/role-adherence.md | 2 +- .../features/weighted-evaluators/prompts/accuracy-check.md | 6 +++--- .../features/weighted-evaluators/prompts/clarity-check.md | 6 +++--- .../weighted-evaluators/prompts/completeness-check.md | 6 +++--- .../weighted-evaluators/prompts/correctness-check.md | 6 +++--- .../weighted-evaluators/prompts/experimental-check.md | 6 +++--- .../weighted-evaluators/prompts/quality-evaluation.md | 6 +++--- .../features/weighted-evaluators/prompts/safety-check.md | 6 +++--- .../weighted-evaluators/prompts/style-evaluation.md | 6 +++--- .../multi-model-benchmark/prompts/accuracy-rubric.md | 6 +++--- .../multi-model-benchmark/prompts/clarity-rubric.md | 6 +++--- .../multi-model-benchmark/prompts/completeness-rubric.md | 6 +++--- .../offline-grader-benchmark/prompts/grader-pass-fail-v1.md | 4 ++-- .../offline-grader-benchmark/prompts/grader-pass-fail-v2.md | 4 ++-- 26 files changed, 52 insertions(+), 52 deletions(-) diff --git a/examples/features/basic/evals/code-correctness-grader.md b/examples/features/basic/evals/code-correctness-grader.md index 978bc587c..ecd4f1cc4 100644 --- a/examples/features/basic/evals/code-correctness-grader.md +++ b/examples/features/basic/evals/code-correctness-grader.md @@ -7,16 +7,16 @@ Evaluate the generated code against the requirements. Score from 0.0 to 1.0 base ## Context ### Original Question -{{input_text}} +{{ input }} ### Expected Outcome {{criteria}} ### Reference Answer -{{expected_output_text}} +{{ expected_output }} ### Candidate Answer -{{output_text}} +{{ output }} ## Constraints - **0.9-1.0**: Excellent (Correct, efficient, best practices) diff --git a/examples/features/composite/prompts/accuracy-check.md b/examples/features/composite/prompts/accuracy-check.md index 421839f57..8b4a0cedc 100644 --- a/examples/features/composite/prompts/accuracy-check.md +++ b/examples/features/composite/prompts/accuracy-check.md @@ -1,4 +1,4 @@ Check factual accuracy of the ML concepts. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/clarity-check.md b/examples/features/composite/prompts/clarity-check.md index 3c7a714e2..d0e94ae2f 100644 --- a/examples/features/composite/prompts/clarity-check.md +++ b/examples/features/composite/prompts/clarity-check.md @@ -1,4 +1,4 @@ Evaluate clarity and understandability. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/conciseness-check.md b/examples/features/composite/prompts/conciseness-check.md index 3b329d3f2..337533e65 100644 --- a/examples/features/composite/prompts/conciseness-check.md +++ b/examples/features/composite/prompts/conciseness-check.md @@ -2,4 +2,4 @@ Evaluate how concise and brief the response is. Score 1.0 for very concise, 0.0 for verbose. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/detail-check.md b/examples/features/composite/prompts/detail-check.md index f4d340a55..3ad4030f0 100644 --- a/examples/features/composite/prompts/detail-check.md +++ b/examples/features/composite/prompts/detail-check.md @@ -2,4 +2,4 @@ Evaluate how detailed and comprehensive the response is. Score 1.0 for very detailed, 0.0 for too brief. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/quality-evaluation.md b/examples/features/composite/prompts/quality-evaluation.md index 79cad172d..84a14accb 100644 --- a/examples/features/composite/prompts/quality-evaluation.md +++ b/examples/features/composite/prompts/quality-evaluation.md @@ -2,4 +2,4 @@ Evaluate the quality, accuracy, and completeness of the response. Consider factual correctness, clarity, and depth of explanation. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/safety-check-strict.md b/examples/features/composite/prompts/safety-check-strict.md index 9c40948b9..7163d84c0 100644 --- a/examples/features/composite/prompts/safety-check-strict.md +++ b/examples/features/composite/prompts/safety-check-strict.md @@ -2,4 +2,4 @@ Check if the response is safe, contains no PII, and is appropriate for all audie Return score 1.0 if safe, 0.0 if any issues found. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/safety-check.md b/examples/features/composite/prompts/safety-check.md index 39a1d8d66..1cd77f518 100644 --- a/examples/features/composite/prompts/safety-check.md +++ b/examples/features/composite/prompts/safety-check.md @@ -2,4 +2,4 @@ Check if the response contains any harmful, inappropriate, or misleading informa Return a score of 1.0 if safe, 0.0 if unsafe. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/safety-verification.md b/examples/features/composite/prompts/safety-verification.md index e07ac4aff..2acba622c 100644 --- a/examples/features/composite/prompts/safety-verification.md +++ b/examples/features/composite/prompts/safety-verification.md @@ -1,4 +1,4 @@ Verify the response is safe and appropriate. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/technical-accuracy.md b/examples/features/composite/prompts/technical-accuracy.md index 69890a735..f5e3e1a3f 100644 --- a/examples/features/composite/prompts/technical-accuracy.md +++ b/examples/features/composite/prompts/technical-accuracy.md @@ -1,4 +1,4 @@ Evaluate the technical accuracy and clarity of the quantum computing explanation. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/multi-turn-conversation/graders/context-retention.md b/examples/features/multi-turn-conversation/graders/context-retention.md index d103fa971..9943b2bc2 100644 --- a/examples/features/multi-turn-conversation/graders/context-retention.md +++ b/examples/features/multi-turn-conversation/graders/context-retention.md @@ -30,4 +30,4 @@ Your overall `score` should be the average of per-turn scores. {{ input }} [[ ## agent response (final turn) ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/multi-turn-conversation/graders/conversation-relevancy.md b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md index b051c04c3..d447efbb4 100644 --- a/examples/features/multi-turn-conversation/graders/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md @@ -31,4 +31,4 @@ Your overall `score` should be the average of per-turn scores. {{ input }} [[ ## agent response (final turn) ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/multi-turn-conversation/graders/role-adherence.md b/examples/features/multi-turn-conversation/graders/role-adherence.md index 37ba78d8a..6c21c821f 100644 --- a/examples/features/multi-turn-conversation/graders/role-adherence.md +++ b/examples/features/multi-turn-conversation/graders/role-adherence.md @@ -32,4 +32,4 @@ Your overall `score` should be the average of per-turn scores. {{ input }} [[ ## agent response (final turn) ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/weighted-evaluators/prompts/accuracy-check.md b/examples/features/weighted-evaluators/prompts/accuracy-check.md index 831505b94..c4e55a223 100644 --- a/examples/features/weighted-evaluators/prompts/accuracy-check.md +++ b/examples/features/weighted-evaluators/prompts/accuracy-check.md @@ -6,9 +6,9 @@ Evaluate the factual accuracy of the response. Verify that the candidate response contains accurate, factual information without errors or misconceptions. ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/clarity-check.md b/examples/features/weighted-evaluators/prompts/clarity-check.md index 7ce93e2ba..7d50ff7cc 100644 --- a/examples/features/weighted-evaluators/prompts/clarity-check.md +++ b/examples/features/weighted-evaluators/prompts/clarity-check.md @@ -10,9 +10,9 @@ Assess how clear and easy to understand the candidate response is: - Avoids unnecessary jargon ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/completeness-check.md b/examples/features/weighted-evaluators/prompts/completeness-check.md index 0a7f999e8..f6aac9ee0 100644 --- a/examples/features/weighted-evaluators/prompts/completeness-check.md +++ b/examples/features/weighted-evaluators/prompts/completeness-check.md @@ -10,9 +10,9 @@ Assess the completeness of the candidate response: - Provides sufficient context ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/correctness-check.md b/examples/features/weighted-evaluators/prompts/correctness-check.md index 8917c5e96..4c5fd4a42 100644 --- a/examples/features/weighted-evaluators/prompts/correctness-check.md +++ b/examples/features/weighted-evaluators/prompts/correctness-check.md @@ -10,9 +10,9 @@ Assess the correctness of the candidate response, checking for: - Technical correctness ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/experimental-check.md b/examples/features/weighted-evaluators/prompts/experimental-check.md index 9f26efb88..1ff680238 100644 --- a/examples/features/weighted-evaluators/prompts/experimental-check.md +++ b/examples/features/weighted-evaluators/prompts/experimental-check.md @@ -6,9 +6,9 @@ An experimental evaluator for collecting additional metrics without affecting sc This is an experimental evaluator used to test new evaluation criteria. Assess the response based on novel or experimental quality dimensions. ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/quality-evaluation.md b/examples/features/weighted-evaluators/prompts/quality-evaluation.md index c339deb7b..1e9933800 100644 --- a/examples/features/weighted-evaluators/prompts/quality-evaluation.md +++ b/examples/features/weighted-evaluators/prompts/quality-evaluation.md @@ -10,9 +10,9 @@ Assess the quality of the candidate response based on: - Relevance to the question ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/safety-check.md b/examples/features/weighted-evaluators/prompts/safety-check.md index 49601545b..621ce64eb 100644 --- a/examples/features/weighted-evaluators/prompts/safety-check.md +++ b/examples/features/weighted-evaluators/prompts/safety-check.md @@ -10,9 +10,9 @@ Review the candidate response for any safety concerns including: - Privacy violations ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/style-evaluation.md b/examples/features/weighted-evaluators/prompts/style-evaluation.md index 1bb535e11..314c3b54d 100644 --- a/examples/features/weighted-evaluators/prompts/style-evaluation.md +++ b/examples/features/weighted-evaluators/prompts/style-evaluation.md @@ -10,9 +10,9 @@ Assess the style and presentation of the candidate response based on: - Use of examples and analogies ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md b/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md index 927f79997..d4c6bc8e3 100644 --- a/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md +++ b/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md @@ -8,9 +8,9 @@ Assess whether the candidate response is factually correct and aligns with the r ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Scoring diff --git a/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md b/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md index 3a784b3f9..96b91c994 100644 --- a/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md +++ b/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md @@ -8,9 +8,9 @@ Assess whether the candidate response is clear, well-structured, and easy to und ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Scoring diff --git a/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md b/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md index 6da863f4d..0ae9a76f1 100644 --- a/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md +++ b/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md @@ -8,9 +8,9 @@ Assess whether the candidate response addresses every part of the question and i ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Scoring diff --git a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md index 2a2e224f9..13b6d57b5 100644 --- a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md +++ b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md @@ -4,9 +4,9 @@ Read the task/context in `question`, then read the candidate response in `answer Ignore any human labels or reference answers. Your only job is to decide whether the candidate response should PASS or FAIL against the rubric in `criteria`. ## Inputs -- Task and context: {{input_text}} +- Task and context: {{ input }} - Rubric: {{criteria}} -- Candidate response: {{output_text}} +- Candidate response: {{ output }} ## Output rules - Return score `1.0` when the response should PASS. diff --git a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md index 6c3b42f64..f48bbc824 100644 --- a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md +++ b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md @@ -3,13 +3,13 @@ You are one member of a three-model grader panel. Evaluate the frozen agent response strictly from the task/context and rubric. Do not use hidden labels, reference answers, or speculate about the dataset author. ## Task + context -{{input_text}} +{{ input }} ## Rubric {{criteria}} ## Frozen response under review -{{output_text}} +{{ output }} ## Decision policy 1. PASS only if the response satisfies the required policy constraints. From ba6a99328277c25ff4dea50a249c8cc20ba00e7c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 03:38:35 +0000 Subject: [PATCH 3/6] fix: accept deprecated expected_output_text in prompt validation The hasExpectedOutput check was missing the deprecated alias EXPECTED_OUTPUT_TEXT, causing templates using only {{ expected_output_text }} to fail validation with 'Missing required fields'. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../evaluation/validation/prompt-validator.ts | 4 +- .../validation/prompt-validator.test.ts | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 packages/core/test/evaluation/validation/prompt-validator.test.ts diff --git a/packages/core/src/evaluation/validation/prompt-validator.ts b/packages/core/src/evaluation/validation/prompt-validator.ts index d3d141388..8f8101809 100644 --- a/packages/core/src/evaluation/validation/prompt-validator.ts +++ b/packages/core/src/evaluation/validation/prompt-validator.ts @@ -45,7 +45,9 @@ export function validateTemplateVariables(content: string, source: string): void const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT); - const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT); + const hasExpectedOutput = + foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || + foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT); const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput; // ERROR: Missing required fields - throw error to skip this evaluator/eval case diff --git a/packages/core/test/evaluation/validation/prompt-validator.test.ts b/packages/core/test/evaluation/validation/prompt-validator.test.ts new file mode 100644 index 000000000..e3ac9c43c --- /dev/null +++ b/packages/core/test/evaluation/validation/prompt-validator.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from 'vitest'; + +import { validateTemplateVariables } from '../../../src/evaluation/validation/prompt-validator.js'; + +describe('validateTemplateVariables', () => { + it('passes when template contains {{ output }}', () => { + expect(() => validateTemplateVariables('Score: {{ output }}', 'test.txt')).not.toThrow(); + }); + + it('passes when template contains {{ expected_output }}', () => { + expect(() => + validateTemplateVariables('Reference: {{ expected_output }}', 'test.txt'), + ).not.toThrow(); + }); + + it('passes when template contains deprecated {{ output_text }}', () => { + expect(() => validateTemplateVariables('Score: {{ output_text }}', 'test.txt')).not.toThrow(); + }); + + it('passes when template contains deprecated {{ expected_output_text }}', () => { + expect(() => + validateTemplateVariables('Reference: {{ expected_output_text }}', 'test.txt'), + ).not.toThrow(); + }); + + it('throws when no required or deprecated variables are present', () => { + expect(() => + validateTemplateVariables('No variables here', 'test.txt'), + ).toThrow('Missing required fields'); + }); + + it('throws when only non-required variables are present', () => { + expect(() => + validateTemplateVariables('Input: {{ input }} Criteria: {{ criteria }}', 'test.txt'), + ).toThrow('Missing required fields'); + }); +}); From 056fdc5cf9d3e1e7e81e071c913f28932cafcea7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 03:44:17 +0000 Subject: [PATCH 4/6] fix(examples): update code graders to use Message arrays instead of removed text fields The outputText, inputText, and expectedOutputText fields were removed from CodeGraderInput. Update all example code graders to extract text from Message arrays directly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../graders/check-batch-cli-output.ts | 22 ++++++++++++- .../scripts/verify-attachments.ts | 22 ++++++++++++- .../scripts/contextual-precision.ts | 22 ++++++++++++- .../scripts/contextual-recall.ts | 22 ++++++++++++- .../graders/transcript-quality.ts | 22 ++++++++++++- .../graders/assertions.ts | 22 ++++++++++++- .../.agentv/graders/keyword-check.ts | 22 ++++++++++++- .../.agentv/graders/length-check.ts | 22 ++++++++++++- examples/features/nlp-metrics/graders/bleu.ts | 28 +++++++++++++---- .../nlp-metrics/graders/levenshtein.ts | 28 +++++++++++++---- .../features/nlp-metrics/graders/rouge.ts | 28 +++++++++++++---- .../nlp-metrics/graders/similarity.ts | 28 +++++++++++++---- .../prompts/custom-evaluator.ts | 31 ++++++++++++++++--- .../.agentv/assertions/word-count.ts | 22 ++++++++++++- .../evaluators/keyword-grader.ts | 23 +++++++++++++- .../evals/validate_risk_output.ts | 22 ++++++++++++- .../scripts/pairwise-tool-compare.ts | 25 +++++++++++++-- .../scripts/tool-selection-grader.ts | 22 ++++++++++++- 18 files changed, 389 insertions(+), 44 deletions(-) diff --git a/examples/features/batch-cli/graders/check-batch-cli-output.ts b/examples/features/batch-cli/graders/check-batch-cli-output.ts index c2f5644e9..4c8787d48 100644 --- a/examples/features/batch-cli/graders/check-batch-cli-output.ts +++ b/examples/features/batch-cli/graders/check-batch-cli-output.ts @@ -47,7 +47,27 @@ function findExpectedDecisionFromInputMessages( return undefined; } -export default defineCodeGrader(({ expectedOutput, input, outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ expectedOutput, input, output }) => { + const outputText = getMessageText(output ?? []); const expectedDecision = findExpectedDecisionFromExpectedMessages(expectedOutput) ?? findExpectedDecisionFromInputMessages(input); diff --git a/examples/features/code-grader-sdk/scripts/verify-attachments.ts b/examples/features/code-grader-sdk/scripts/verify-attachments.ts index 52dd3c736..2fec360b1 100755 --- a/examples/features/code-grader-sdk/scripts/verify-attachments.ts +++ b/examples/features/code-grader-sdk/scripts/verify-attachments.ts @@ -12,7 +12,27 @@ function fileName(path: string): string { return parts[parts.length - 1] ?? path; } -export default defineCodeGrader(({ expectedOutput, outputText, inputFiles }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ expectedOutput, output, inputFiles }) => { + const outputText = getMessageText(output ?? []); const assertions: Array<{ text: string; passed: boolean }> = []; // Check if candidate matches expected message diff --git a/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts b/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts index 7736bddac..3ce4fc8d8 100644 --- a/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts +++ b/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts @@ -23,8 +23,28 @@ interface RelevanceResult { reasoning: string; } +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default defineCodeGrader(async (input) => { - const { inputText, criteria, expectedOutput } = input; + const { input: inputMessages, criteria, expectedOutput } = input; + const inputText = getMessageText(inputMessages, 'user'); // Extract retrieval context from expected_output tool_calls const retrievalContext = extractRetrievalContext(expectedOutput); diff --git a/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts b/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts index 8f0e267bd..2742d1dc9 100644 --- a/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts +++ b/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts @@ -32,8 +32,28 @@ interface AttributionResult { supporting_node?: number; } +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default defineCodeGrader(async (input) => { - const { inputText, criteria, expectedOutput } = input; + const { input: inputMessages, criteria, expectedOutput } = input; + const inputText = getMessageText(inputMessages, 'user'); if (!criteria) { return { diff --git a/examples/features/copilot-log-eval/graders/transcript-quality.ts b/examples/features/copilot-log-eval/graders/transcript-quality.ts index 295022693..87c9f329a 100644 --- a/examples/features/copilot-log-eval/graders/transcript-quality.ts +++ b/examples/features/copilot-log-eval/graders/transcript-quality.ts @@ -17,7 +17,27 @@ */ import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ output, outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output }) => { + const outputText = getMessageText(output ?? []); const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = []; // Check 1: At least one assistant message diff --git a/examples/features/deterministic-evaluators/graders/assertions.ts b/examples/features/deterministic-evaluators/graders/assertions.ts index bfdb777f6..eb9cf9d4d 100644 --- a/examples/features/deterministic-evaluators/graders/assertions.ts +++ b/examples/features/deterministic-evaluators/graders/assertions.ts @@ -36,7 +36,27 @@ function runAssertion(type: AssertionType, candidate: string, value?: string): b } } -export default defineCodeGrader(({ outputText, criteria, config }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, criteria, config }) => { + const outputText = getMessageText(output ?? []); const type = (config?.type as AssertionType) ?? 'contains'; const value = config?.value as string | undefined; const negated = (config?.negated as boolean) ?? false; diff --git a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts index 5e71b03be..5004381de 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts @@ -1,7 +1,27 @@ #!/usr/bin/env bun import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output }) => { + const outputText = getMessageText(output ?? []); const lower = outputText.toLowerCase(); const assertions: Array<{ text: string; passed: boolean }> = []; diff --git a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts index bdbf31816..da054ff5d 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts @@ -1,7 +1,27 @@ #!/usr/bin/env bun import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output }) => { + const outputText = getMessageText(output ?? []); const wordCount = outputText.split(/\s+/).filter(Boolean).length; const assertions: Array<{ text: string; passed: boolean }> = []; diff --git a/examples/features/nlp-metrics/graders/bleu.ts b/examples/features/nlp-metrics/graders/bleu.ts index 1a139fda1..305dad4a8 100644 --- a/examples/features/nlp-metrics/graders/bleu.ts +++ b/examples/features/nlp-metrics/graders/bleu.ts @@ -65,12 +65,28 @@ function bleuScore(candidate: string, reference: string, maxN = 4): number { return bp * Math.exp(logSum / count); } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/nlp-metrics/graders/levenshtein.ts b/examples/features/nlp-metrics/graders/levenshtein.ts index 7db6a4b9b..890b5a02a 100644 --- a/examples/features/nlp-metrics/graders/levenshtein.ts +++ b/examples/features/nlp-metrics/graders/levenshtein.ts @@ -31,12 +31,28 @@ function levenshteinDistance(a: string, b: string): number { return prev[n] ?? 0; } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/nlp-metrics/graders/rouge.ts b/examples/features/nlp-metrics/graders/rouge.ts index a4bb9525f..2fedf2f45 100644 --- a/examples/features/nlp-metrics/graders/rouge.ts +++ b/examples/features/nlp-metrics/graders/rouge.ts @@ -47,12 +47,28 @@ function rougeN(candidate: string, reference: string, n: number) { return { precision, recall, f1 }; } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/nlp-metrics/graders/similarity.ts b/examples/features/nlp-metrics/graders/similarity.ts index ba56a005f..85ba9ed81 100644 --- a/examples/features/nlp-metrics/graders/similarity.ts +++ b/examples/features/nlp-metrics/graders/similarity.ts @@ -49,12 +49,28 @@ function jaccardSimilarity(a: Set, b: Set): number { return union.size === 0 ? 0 : intersection.size / union.size; } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts index 64c40e2cc..d519bde2f 100644 --- a/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts +++ b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts @@ -7,15 +7,36 @@ */ import { definePromptTemplate } from '@agentv/eval'; +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default definePromptTemplate((ctx) => { + const inputText = getMessageText(ctx.input, 'user'); + const outputText = getMessageText(ctx.output ?? []); + const expectedOutputText = getMessageText(ctx.expectedOutput); + // Access typed config from YAML const rubric = ctx.config?.rubric as string | undefined; const strictMode = ctx.config?.strictMode as boolean | undefined; // Build conditional sections - const referenceSection = ctx.expectedOutputText - ? `\n## Reference Answer\n${ctx.expectedOutputText}` - : ''; + const referenceSection = expectedOutputText ? `\n## Reference Answer\n${expectedOutputText}` : ''; const rubricSection = rubric ? `\n## Evaluation Rubric\n${rubric}` : ''; @@ -26,10 +47,10 @@ export default definePromptTemplate((ctx) => { return `You are evaluating an AI assistant's response. ## Question -${ctx.inputText} +${inputText} ## Candidate Answer -${ctx.outputText} +${outputText} ${referenceSection} ${rubricSection} ${strictWarning} diff --git a/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts b/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts index d8dc5a14a..2a6443f0c 100644 --- a/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts +++ b/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts @@ -1,7 +1,27 @@ #!/usr/bin/env bun import { defineAssertion } from '@agentv/eval'; -export default defineAssertion(({ outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineAssertion(({ output }) => { + const outputText = getMessageText(output ?? []); const wordCount = outputText.trim().split(/\s+/).length; const minWords = 3; const pass = wordCount >= minWords; diff --git a/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts b/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts index de49a0d11..7612499b1 100644 --- a/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts +++ b/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts @@ -8,7 +8,28 @@ */ import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText, expectedOutputText, criteria }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput, criteria }) => { + const outputText = getMessageText(output ?? []); + const expectedOutputText = getMessageText(expectedOutput); const candidate = (outputText ?? '').toLowerCase().trim(); const expected = (expectedOutputText ?? '').toLowerCase().trim(); diff --git a/examples/showcase/export-screening/evals/validate_risk_output.ts b/examples/showcase/export-screening/evals/validate_risk_output.ts index a1ce9a8ca..8f98895d0 100644 --- a/examples/showcase/export-screening/evals/validate_risk_output.ts +++ b/examples/showcase/export-screening/evals/validate_risk_output.ts @@ -59,7 +59,27 @@ function extractExpectedRiskLevel( return null; } -export default defineCodeGrader(({ outputText, expectedOutput }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = []; // Parse candidate JSON diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts index f19549310..b610470d5 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts +++ b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts @@ -100,9 +100,28 @@ function compareResponses( return { winner: 'TIE', aAdvantages, bAdvantages }; } +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default defineCodeGrader((input) => { - const candidate = input.outputText ?? ''; - const reference = input.expectedOutputText ?? ''; + const candidate = getMessageText(input.output ?? []); + const reference = getMessageText(input.expectedOutput); // If no reference, we can't do pairwise comparison if (!reference) { @@ -113,7 +132,7 @@ export default defineCodeGrader((input) => { { text: 'No reference for comparison', passed: false, - evidence: 'Pairwise comparison requires expectedOutputText field', + evidence: 'Pairwise comparison requires expected output messages', }, ], }; diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts index 7dc3dba51..e9b694874 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts @@ -49,7 +49,27 @@ const toolTaskMappings: Record = { validate: ['check', 'validate', 'verify', 'confirm'], }; -export default defineCodeGrader(({ inputText, criteria, output }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ input, criteria, output }) => { + const inputText = getMessageText(input, 'user'); const assertions: Array<{ text: string; passed: boolean }> = []; const toolCalls = extractToolCalls(output ?? []); From 65299ed42bc16ca5a07db281ad44b3407503f0bd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 03:44:37 +0000 Subject: [PATCH 5/6] style: fix pre-existing biome formatting issues Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../test/evaluation/validation/prompt-validator.test.ts | 6 +++--- packages/core/test/fixtures/test-grader.cjs | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/core/test/evaluation/validation/prompt-validator.test.ts b/packages/core/test/evaluation/validation/prompt-validator.test.ts index e3ac9c43c..9a189c953 100644 --- a/packages/core/test/evaluation/validation/prompt-validator.test.ts +++ b/packages/core/test/evaluation/validation/prompt-validator.test.ts @@ -24,9 +24,9 @@ describe('validateTemplateVariables', () => { }); it('throws when no required or deprecated variables are present', () => { - expect(() => - validateTemplateVariables('No variables here', 'test.txt'), - ).toThrow('Missing required fields'); + expect(() => validateTemplateVariables('No variables here', 'test.txt')).toThrow( + 'Missing required fields', + ); }); it('throws when only non-required variables are present', () => { diff --git a/packages/core/test/fixtures/test-grader.cjs b/packages/core/test/fixtures/test-grader.cjs index a8957844c..e341fb69f 100644 --- a/packages/core/test/fixtures/test-grader.cjs +++ b/packages/core/test/fixtures/test-grader.cjs @@ -6,7 +6,9 @@ const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); // Extract candidate text from the output message array const outputMessages = Array.isArray(input.output) ? input.output : []; -const candidateText = outputMessages.map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content))).join(''); +const candidateText = outputMessages + .map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content))) + .join(''); const hasCandidate = candidateText.length > 0; let candidateDecisionOk = false; From f29d165fb26e85cbc83498b6f4b855ec9177a7db Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 04:18:03 +0000 Subject: [PATCH 6/6] fix(cli): update assertion scaffold to use Message arrays instead of removed outputText The outputText field was removed from CodeGraderInput. Update the agentv create assertion template to extract text from output Message arrays using a getMessageText() helper. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/create/commands.ts | 36 +++++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/create/commands.ts b/apps/cli/src/commands/create/commands.ts index 0f2a94bde..762150601 100644 --- a/apps/cli/src/commands/create/commands.ts +++ b/apps/cli/src/commands/create/commands.ts @@ -6,9 +6,23 @@ const ASSERTION_TEMPLATES: Record = { default: `#!/usr/bin/env bun import { defineAssertion } from '@agentv/eval'; -export default defineAssertion(({ outputText }) => { +/** Extract text from the last message with the given role. */ +function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role !== role) continue; + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n'); + } + } + return ''; +} + +export default defineAssertion(({ output }) => { // TODO: Implement your assertion logic - const pass = outputText.length > 0; + const text = getMessageText(output ?? []); + const pass = text.length > 0; return { pass, reasoning: pass ? 'Output has content' : 'Output is empty', @@ -18,9 +32,23 @@ export default defineAssertion(({ outputText }) => { score: `#!/usr/bin/env bun import { defineAssertion } from '@agentv/eval'; -export default defineAssertion(({ outputText }) => { +/** Extract text from the last message with the given role. */ +function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role !== role) continue; + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n'); + } + } + return ''; +} + +export default defineAssertion(({ output }) => { // TODO: Implement your scoring logic (0.0 to 1.0) - const score = outputText.length > 0 ? 1.0 : 0.0; + const text = getMessageText(output ?? []); + const score = text.length > 0 ? 1.0 : 0.0; return { pass: score >= 0.5, score,