From e9888b38c94b17f3653f8450a46fc8da39897dd9 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 10 Jun 2026 04:54:53 +0200 Subject: [PATCH 1/3] feat(core): add structured llm grader inputs --- .../content/docs/docs/graders/llm-graders.mdx | 51 ++++++-- .../src/evaluation/graders/code-grader.ts | 2 + .../evaluation/graders/llm-grader-prompt.ts | 106 ++++++++++++++--- .../core/src/evaluation/graders/llm-grader.ts | 111 ++++++++++-------- .../evaluation/graders/prompt-resolution.ts | 2 + .../src/evaluation/loaders/grader-parser.ts | 2 +- .../core/src/evaluation/template-variables.ts | 12 ++ packages/core/src/evaluation/types.ts | 2 + packages/core/src/evaluation/yaml-parser.ts | 44 ++++--- .../evaluation/evaluators_variables.test.ts | 59 ++++++++++ .../graders/prompt-resolution.test.ts | 8 ++ .../evaluation/yaml-parser-metadata.test.ts | 55 +++++++++ packages/eval/src/schemas.ts | 2 + 13 files changed, 362 insertions(+), 94 deletions(-) diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index 14f88ba6a..cc40ab2c1 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -29,7 +29,7 @@ Reference an LLM grader in your eval file: assertions: - name: semantic_check type: llm-grader - prompt: ./graders/correctness.md + prompt: file://graders/correctness.md target: grader_gpt_5_mini # optional: route this grader to a named LLM target ``` @@ -69,12 +69,45 @@ Score the response from 0.0 to 1.0 based on: | `output_text` | Last candidate response content | | `expected_output_text` | Last expected message content | | `criteria` | Test `criteria` field | -| `input` | Full resolved input array, JSON-serialized | -| `expected_output` | Full resolved expected array, JSON-serialized | -| `output` | Full provider output array, JSON-serialized | +| `input` | Resolved input text | +| `expected_output` | Reference answer text | +| `output` | Candidate answer text | +| `metadata` | Test metadata as formatted JSON | +| `metadata_json` | Test metadata as compact JSON | +| `input_object` | Test `input_object` as formatted JSON | +| `input_object_json` | Test `input_object` as compact JSON | +| `rubrics` | LLM-grader rubric items as formatted JSON | +| `rubrics_json` | LLM-grader rubric items as compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | +Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths. + +Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file: + +```yaml +metadata: + source_repo: https://github.com/virattt/dexter + source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629 + source_file: src/evals/dataset/finance_agent.csv + +tests: + - id: apple-research + input: Research Apple + input_object: + company: Apple + ticker: AAPL + metadata: + row: 1 + assertions: + - name: dexter_semantic + type: llm-grader + prompt: file://prompts/dexter-grader.md + rubrics: + - operator: correctness + criteria: Uses the provided ticker and company. +``` + ## Per-Grader Target By default, an `llm-grader` uses the suite target's `grader_target`. Override it per grader when you need multiple grader models in one run: @@ -193,6 +226,8 @@ TypeScript templates receive a context object with these fields: | `expectedOutput` | `Message[]` | Full resolved expected output | | `output` | `Message[]` | Full provider output messages | | `trace` | `TraceSummary` | Execution metrics summary | +| `inputObject` | `unknown` | Optional structured `input_object` payload | +| `metadata` | `object` | Test metadata after suite defaults are merged | | `config` | `object` | Custom config from YAML | ## Template Variable Derivation @@ -225,9 +260,11 @@ Derived strings injected into grader prompts: | `criteria` | Passed through from the test field | | `expected_output_text` | Content of the last entry in `expected_output` | | `output_text` | Content of the last entry in `output` | -| `input` | Full resolved input array, JSON-serialized | -| `expected_output` | Full resolved expected array, JSON-serialized | -| `output` | Full provider output array, JSON-serialized | +| `input` | Resolved input text | +| `expected_output` | Reference answer text | +| `output` | Candidate answer text | +| `metadata_json` | Test metadata, compact JSON | +| `input_object_json` | Structured test input object, compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index b672ab32d..1aca3abd6 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -168,6 +168,8 @@ export class CodeGrader implements Grader { context.evalCase.input as readonly Record[], getImageDir, ), + inputObject: context.evalCase.inputObject ?? null, + metadata: context.evalCase.metadata ?? null, trace: context.trace ?? null, tokenUsage: context.tokenUsage ?? null, costUsd: context.costUsd ?? null, diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index bb78dd39a..fc12ff0ef 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -18,6 +18,46 @@ export interface LlmGraderPromptAssembly { mode: 'freeform' | 'checklist' | 'score_range'; } +function stringifyPretty(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value, null, 2); +} + +function stringifyCompact(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value); +} + +function buildTemplateVariables(input: { + evalCase: EvalTest; + candidate: string; + promptInputs: PromptInputs; + rubrics?: readonly RubricItem[]; + fileChanges?: string; + toolCalls?: string; +}): Record { + const formattedQuestion = + input.promptInputs.question && input.promptInputs.question.trim().length > 0 + ? input.promptInputs.question + : input.evalCase.question; + + return { + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: input.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (input.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(input.evalCase.metadata), + [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(input.evalCase.metadata), + [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(input.evalCase.inputObject), + [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(input.evalCase.inputObject), + [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(input.rubrics), + [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(input.rubrics), + [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? '', + [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? '').trim(), + }; +} + export function assembleLlmGraderPrompt(input: { evalCase: EvalTest; candidate: string; @@ -42,6 +82,17 @@ export function assembleLlmGraderPrompt(input: { // Detect mode if (rubrics && rubrics.length > 0) { + if (graderTemplateOverride) { + return assembleCustom( + evalCase, + candidate, + promptInputs, + rubrics, + fileChanges, + toolCalls, + graderTemplateOverride, + ); + } const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); if (hasScoreRanges) { return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls); @@ -67,23 +118,13 @@ function assembleFreeform( toolCalls?: string, graderTemplateOverride?: string, ): LlmGraderPromptAssembly { - const formattedQuestion = - promptInputs.question && promptInputs.question.trim().length > 0 - ? promptInputs.question - : evalCase.question; - - const variables = { - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '', - // Deprecated aliases - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? '').trim(), - }; + const variables = buildTemplateVariables({ + evalCase, + candidate, + promptInputs, + fileChanges, + toolCalls, + }); const systemPrompt = buildOutputSchema(); const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE; @@ -105,6 +146,37 @@ function assembleFreeform( }; } +function assembleCustom( + evalCase: EvalTest, + candidate: string, + promptInputs: PromptInputs, + rubrics: readonly RubricItem[], + fileChanges: string | undefined, + toolCalls: string | undefined, + graderTemplateOverride: string, +): LlmGraderPromptAssembly { + const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); + const systemPrompt = hasScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema(); + const userPrompt = substituteVariables( + graderTemplateOverride, + buildTemplateVariables({ + evalCase, + candidate, + promptInputs, + rubrics, + fileChanges, + toolCalls, + }), + ); + + return { + systemPrompt, + userPrompt, + responseSchema: systemPrompt, + mode: hasScoreRanges ? 'score_range' : 'checklist', + }; +} + function assembleChecklist( evalCase: EvalTest, candidate: string, diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 3b6f58234..5092da81f 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -155,6 +155,41 @@ interface StructuredGenerationResult { readonly tokenUsage?: TokenUsage; } +function stringifyPretty(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value, null, 2); +} + +function stringifyCompact(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value); +} + +function buildTemplateVariables(context: EvaluationContext): Record { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + const rubrics = context.evaluator?.type === 'llm-grader' ? context.evaluator.rubrics : undefined; + + return { + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata), + [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata), + [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(context.evalCase.inputObject), + [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(context.evalCase.inputObject), + [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics), + [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', + // Deprecated aliases — same values as the primary variables above + [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), + }; +} + function resolveContentBasePath(context: EvaluationContext): string | undefined { if (context.workspacePath) { return context.workspacePath; @@ -259,25 +294,7 @@ export class LlmGrader implements Grader { context: EvaluationContext, graderProvider: Provider, ): Promise { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - // Prepare template variables for substitution. - // Primary variables resolve to human-readable text; deprecated _text aliases map to the same values. - const variables = { - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', - // Deprecated aliases — same values as the primary variables above - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - }; + const variables = buildTemplateVariables(context); // Build system prompt (only the mandatory output schema) const systemPrompt = buildOutputSchema(); @@ -367,7 +384,10 @@ export class LlmGrader implements Grader { return this.evaluateWithScoreRanges(context, graderProvider, rubrics); } - const prompt = this.buildRubricPrompt(context, rubrics); + const prompt = + context.graderTemplateOverride || this.graderTemplate + ? this.buildCustomPrompt(context) + : this.buildRubricPrompt(context, rubrics); const systemPrompt = buildRubricOutputSchema(); const graderRawRequest: JsonObject = { @@ -423,7 +443,10 @@ export class LlmGrader implements Grader { graderProvider: Provider, rubrics: readonly RubricItem[], ): Promise { - const prompt = this.buildScoreRangePrompt(context, rubrics); + const prompt = + context.graderTemplateOverride || this.graderTemplate + ? this.buildCustomPrompt(context) + : this.buildScoreRangePrompt(context, rubrics); const systemPrompt = buildScoreRangeOutputSchema(); const graderRawRequest: JsonObject = { @@ -688,22 +711,12 @@ export class LlmGrader implements Grader { ? context.promptInputs.question : context.evalCase.question; - const variables: Record = { - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', - // Deprecated aliases - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - }; + const variables = buildTemplateVariables(context); - if (this.graderTemplate) { - warnDeprecatedTemplateVars(this.graderTemplate); - return substituteVariables(this.graderTemplate, variables); + const template = context.graderTemplateOverride ?? this.graderTemplate; + if (template) { + warnDeprecatedTemplateVars(template); + return substituteVariables(template, variables); } const config = context.evaluator; @@ -767,21 +780,11 @@ export class LlmGrader implements Grader { const config = context.evaluator; const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined; - if (this.graderTemplate) { - const variables: Record = { - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', - // Deprecated aliases - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - }; - warnDeprecatedTemplateVars(this.graderTemplate); - const customPrompt = substituteVariables(this.graderTemplate, variables); + const template = context.graderTemplateOverride ?? this.graderTemplate; + if (template) { + const variables = buildTemplateVariables(context); + warnDeprecatedTemplateVars(template); + const customPrompt = substituteVariables(template, variables); const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); @@ -984,6 +987,12 @@ export class LlmGrader implements Grader { return parts.join('\n'); } + private buildCustomPrompt(context: EvaluationContext): string { + const template = context.graderTemplateOverride ?? this.graderTemplate ?? ''; + warnDeprecatedTemplateVars(template); + return substituteVariables(template, buildTemplateVariables(context)); + } + private buildRubricPrompt(context: EvaluationContext, rubrics: readonly RubricItem[]): string { const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts index 4e9bf7a5a..0e90fd47f 100644 --- a/packages/core/src/evaluation/graders/prompt-resolution.ts +++ b/packages/core/src/evaluation/graders/prompt-resolution.ts @@ -103,6 +103,8 @@ async function executePromptTemplate( output: context.output ?? null, inputFiles: context.evalCase.file_paths, input: context.evalCase.input, + inputObject: context.evalCase.inputObject ?? null, + metadata: context.evalCase.metadata ?? null, trace: context.trace ?? null, fileChanges: context.fileChanges ?? null, workspacePath: context.workspacePath ?? null, diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 92ba2832b..7fa75aea9 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -2143,7 +2143,7 @@ function parseRubricItems( } const id = asString(rawRubric.id) ?? `rubric-${index + 1}`; - const expectedOutcome = asString(rawRubric.outcome) ?? ''; + const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? ''; const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId); const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0; diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index 9d92f0d87..0fd268e09 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -7,6 +7,12 @@ * - {{ output }} — last assistant message as plain text * - {{ expected_output }} — reference answer as plain text * - {{ criteria }} — evaluation criteria string + * - {{ metadata }} — per-test metadata as formatted JSON + * - {{ metadata_json }} — per-test metadata as compact JSON + * - {{ input_object }} — per-test structured input object as formatted JSON + * - {{ input_object_json }} — per-test structured input object as compact JSON + * - {{ rubrics }} — llm-grader rubrics as formatted JSON + * - {{ rubrics_json }} — llm-grader rubrics as compact JSON * - {{ file_changes }} — file diff (if available) * - {{ tool_calls }} — formatted summary of tool calls from agent execution * @@ -18,6 +24,12 @@ export const TEMPLATE_VARIABLES = { EXPECTED_OUTPUT: 'expected_output', CRITERIA: 'criteria', + METADATA: 'metadata', + METADATA_JSON: 'metadata_json', + INPUT_OBJECT: 'input_object', + INPUT_OBJECT_JSON: 'input_object_json', + RUBRICS: 'rubrics', + RUBRICS_JSON: 'rubrics_json', INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 7e764ff8b..1b31f5bfc 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1009,6 +1009,8 @@ export interface EvalTest { readonly conversation_id?: string; readonly question: string; readonly input: readonly TestMessage[]; + /** Optional structured per-case input payload for grader prompt templates. */ + readonly inputObject?: JsonValue; readonly expected_output: readonly JsonObject[]; readonly reference_answer?: string; readonly file_paths: readonly string[]; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 8bf48d414..35c9c34e9 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -64,7 +64,7 @@ import type { WorkspaceHooksConfig, WorkspaceScriptConfig, } from './types.js'; -import { isJsonObject, isTestMessage } from './types.js'; +import { isJsonObject, isJsonValue, isTestMessage } from './types.js'; import { parseRepoConfig } from './workspace/repo-config-parser.js'; import { parseYamlValue } from './yaml-loader.js'; @@ -119,6 +119,8 @@ type RawTestSuite = JsonObject & { /** @deprecated Use `assertions` instead */ readonly assert?: JsonValue; readonly input?: JsonValue; + readonly metadata?: JsonValue; + readonly governance?: JsonValue; /** Shorthand: list of file paths to prepend as type:file content blocks in each test's user message. */ readonly input_files?: JsonValue; // Suite-level metadata fields @@ -140,6 +142,7 @@ type RawEvalCase = JsonObject & { /** @deprecated Use `criteria` instead */ readonly expected_outcome?: JsonValue; readonly input?: JsonValue; + readonly input_object?: JsonValue; /** Shorthand: list of file paths to prepend as type:file content blocks in the user message. */ readonly input_files?: JsonValue; readonly expected_output?: JsonValue; @@ -431,9 +434,9 @@ async function loadTestsFromYaml( const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir); - // Suite-level governance block (top-level `governance:` wins over `metadata.governance:`). - // Merged into each case's `metadata.governance` via mergeSuiteMetadataPayload. - const suiteGovernance = extractSuiteGovernance(suite); + // Suite-level metadata defaults. Top-level `metadata:` is inherited by each case. + // Top-level `governance:` wins over `metadata.governance:` for compatibility. + const suiteMetadataPayload = extractSuiteMetadataPayload(suite); const rawSuiteInput = suite.input; const rawSuiteInputFiles = suite.input_files; @@ -631,9 +634,10 @@ async function loadTestsFromYaml( const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? (renderedCase.metadata as Record) : undefined; - const suitePayload = - suiteGovernance !== undefined ? { governance: suiteGovernance } : undefined; - const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload); + const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload); + const inputObject = isJsonValue(renderedCase.input_object) + ? renderedCase.input_object + : undefined; // Extract per-test targets override (matrix evaluation) const caseTargets = extractTargetsFromTestCase(renderedCase as JsonObject); @@ -679,6 +683,7 @@ async function loadTestsFromYaml( conversation_id: conversationId, question: question, input: inputMessages, + ...(inputObject !== undefined ? { inputObject } : {}), expected_output: outputSegments, reference_answer: referenceAnswer, file_paths: userFilePaths, @@ -1328,23 +1333,26 @@ function asString(value: unknown): string | undefined { } /** - * Pull the optional `governance` block out of a suite YAML. Top-level `governance:` wins - * over the nested `metadata.governance:` form so that authors who already use top-level - * suite metadata fields (`name`, `description`, `tags`) can keep their existing layout. + * Build metadata defaults inherited by each test case. Top-level `metadata:` carries + * arbitrary domain/source fields; top-level `governance:` wins over nested + * `metadata.governance:` so existing governance evals keep their precedence. */ -function extractSuiteGovernance(suite: RawTestSuite): Record | undefined { +function extractSuiteMetadataPayload(suite: RawTestSuite): Record | undefined { + const payload = isJsonObject(suite.metadata) + ? ({ ...(suite.metadata as Record) } as Record) + : {}; + const top = (suite as JsonObject).governance; if (isJsonObject(top)) { - return top as Record; - } - const wrapper = (suite as JsonObject).metadata; - if (isJsonObject(wrapper)) { - const nested = (wrapper as JsonObject).governance; + payload.governance = top as Record; + } else { + const nested = payload.governance; if (isJsonObject(nested)) { - return nested as Record; + payload.governance = nested as Record; } } - return undefined; + + return Object.keys(payload).length > 0 ? payload : undefined; } /** diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index dbd925480..008e8d80f 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -96,6 +96,65 @@ File Changes: {{file_changes}} expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`); }); + it('substitutes structured metadata, input_object, and rubrics variables', async () => { + const customPrompt = ` +Metadata: {{metadata_json}} +Input Object: {{input_object_json}} +Rubrics: {{rubrics_json}} +Candidate: {{output}} +`; + + const graderProvider = new CapturingProvider({ + output: [ + { + role: 'assistant', + content: JSON.stringify({ + checks: [{ id: 'factual', satisfied: true, reasoning: 'Matches' }], + overall_reasoning: 'OK', + }), + }, + ], + }); + + const evaluator = new LlmGrader({ + resolveGraderProvider: async () => graderProvider, + graderTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: { + ...baseTestCase, + inputObject: { company: 'Apple', ticker: 'AAPL' }, + metadata: { source_repo: 'https://github.com/virattt/dexter' }, + }, + candidate: 'Apple revenue increased.', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: 'Research Apple' }, + now: new Date(), + evaluator: { + name: 'dexter', + type: 'llm-grader', + rubrics: [ + { + id: 'factual', + operator: 'correctness', + outcome: 'Uses the supplied ticker', + weight: 1, + required: true, + }, + ], + }, + }); + + const prompt = graderProvider.lastRequest?.question ?? ''; + expect(prompt).toContain('"source_repo":"https://github.com/virattt/dexter"'); + expect(prompt).toContain('"ticker":"AAPL"'); + expect(prompt).toContain('"operator":"correctness"'); + expect(prompt).toContain('Candidate: Apple revenue increased.'); + }); + it('deprecated _text aliases still resolve correctly', async () => { const formattedQuestion = 'What is 2+2?'; const customPrompt = ` diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts index 02ef7d946..236acf9cb 100644 --- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts +++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts @@ -26,6 +26,14 @@ describe('containsTemplateVariables', () => { expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true); }); + it('returns true for structured template variables', () => { + expect( + containsTemplateVariables( + 'Review {{metadata_json}}, {{input_object_json}}, and {{rubrics_json}} against {{output}}', + ), + ).toBe(true); + }); + it('returns true for deprecated {{output_text}} variable', () => { expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true); }); diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts index 12dea2bde..08280300b 100644 --- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts +++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts @@ -173,4 +173,59 @@ tests: owasp_llm_top_10_2025: ['LLM01'], }); }); + + it('merges arbitrary suite metadata into each case and lets case scalars override', async () => { + const { filePath, dir } = createTempYaml(` +metadata: + source_repo: https://github.com/virattt/dexter + source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629 + source_file: src/evals/dataset/finance_agent.csv + tags: [suite] +tests: + - id: case-1 + criteria: "Answer" + input: "Query" + metadata: + source_file: override.csv + tags: [case] +`); + + const suite = await loadTestSuite(filePath, dir); + expect(suite.tests[0].metadata).toMatchObject({ + source_repo: 'https://github.com/virattt/dexter', + source_commit: '8d9419829f443f84b804d033bb2c3b1fbd788629', + source_file: 'override.csv', + tags: ['suite', 'case'], + }); + }); + + it('loads structured input_object and rubric criteria aliases', async () => { + const { filePath, dir } = createTempYaml(` +tests: + - id: case-1 + input: "Research Apple" + input_object: + company: Apple + ticker: AAPL + assertions: + - name: dexter_rubric + type: llm-grader + rubrics: + - id: factual + operator: correctness + criteria: "Uses the supplied company and ticker" +`); + + const suite = await loadTestSuite(filePath, dir); + expect(suite.tests[0].inputObject).toEqual({ company: 'Apple', ticker: 'AAPL' }); + const grader = suite.tests[0].assertions?.[0]; + expect(grader?.type).toBe('llm-grader'); + if (grader?.type === 'llm-grader') { + expect(grader.rubrics?.[0]).toMatchObject({ + id: 'factual', + operator: 'correctness', + outcome: 'Uses the supplied company and ticker', + }); + } + }); }); diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 374ca6651..d92e5604f 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -276,6 +276,8 @@ export const CodeGraderInputSchema = z.object({ outputPath: z.string().optional(), inputFiles: z.array(z.string()), input: z.array(MessageSchema), + inputObject: z.unknown().nullable().optional(), + metadata: z.record(z.unknown()).nullable().optional(), trace: TraceSummarySchema.nullable().optional(), tokenUsage: TokenUsageSchema.nullable().optional(), costUsd: z.number().nullable().optional(), From dcdde97d0c488e9d9266dec299cd12b0881073fd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 10 Jun 2026 05:24:57 +0200 Subject: [PATCH 2/3] docs: clarify optional llm grader input object --- apps/web/src/content/docs/docs/graders/llm-graders.mdx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index cc40ab2c1..85b78145f 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -74,8 +74,8 @@ Score the response from 0.0 to 1.0 based on: | `output` | Candidate answer text | | `metadata` | Test metadata as formatted JSON | | `metadata_json` | Test metadata as compact JSON | -| `input_object` | Test `input_object` as formatted JSON | -| `input_object_json` | Test `input_object` as compact JSON | +| `input_object` | Optional grader-only `input_object` payload as formatted JSON | +| `input_object_json` | Optional grader-only `input_object` payload as compact JSON | | `rubrics` | LLM-grader rubric items as formatted JSON | | `rubrics_json` | LLM-grader rubric items as compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | @@ -83,7 +83,9 @@ Score the response from 0.0 to 1.0 based on: Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths. -Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file: +`input_object` is optional grader-only structured data. Use it when the grader prompt needs a stable machine-readable payload through `{{input_object_json}}` while the agent-facing `input` remains a natural-language prompt or message array. If your existing `input` already contains the needed object (for example, a message whose `content` is a JSON object), keep using `input` and reference `{{input}}`; you do not need to duplicate that data into `input_object`. + +Suite-level `metadata` is inherited by every test. When rubric items vary per test and you want a separate grader-only payload, keep the grader on each test and reuse the prompt file: ```yaml metadata: @@ -264,7 +266,7 @@ Derived strings injected into grader prompts: | `expected_output` | Reference answer text | | `output` | Candidate answer text | | `metadata_json` | Test metadata, compact JSON | -| `input_object_json` | Structured test input object, compact JSON | +| `input_object_json` | Optional grader-only structured payload, compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | From 02bd6982d03ee9faede6b94a478d271ee1a76660 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 10 Jun 2026 05:48:39 +0200 Subject: [PATCH 3/3] refactor(core): reuse input for structured llm grader data --- .../web/src/content/docs/docs/graders/llm-graders.mdx | 11 +++-------- packages/core/src/evaluation/graders/code-grader.ts | 1 - .../core/src/evaluation/graders/llm-grader-prompt.ts | 2 -- packages/core/src/evaluation/graders/llm-grader.ts | 2 -- .../core/src/evaluation/graders/prompt-resolution.ts | 1 - .../src/evaluation/loaders/shorthand-expansion.ts | 10 ++++++++++ packages/core/src/evaluation/template-variables.ts | 4 ---- packages/core/src/evaluation/types.ts | 2 -- packages/core/src/evaluation/yaml-parser.ts | 8 +------- .../core/test/evaluation/evaluators_variables.test.ts | 10 +++++----- .../test/evaluation/graders/prompt-resolution.test.ts | 2 +- .../core/test/evaluation/yaml-parser-metadata.test.ts | 8 ++++---- packages/eval/src/schemas.ts | 1 - 13 files changed, 24 insertions(+), 38 deletions(-) diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index 85b78145f..ec5241805 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -74,8 +74,6 @@ Score the response from 0.0 to 1.0 based on: | `output` | Candidate answer text | | `metadata` | Test metadata as formatted JSON | | `metadata_json` | Test metadata as compact JSON | -| `input_object` | Optional grader-only `input_object` payload as formatted JSON | -| `input_object_json` | Optional grader-only `input_object` payload as compact JSON | | `rubrics` | LLM-grader rubric items as formatted JSON | | `rubrics_json` | LLM-grader rubric items as compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | @@ -83,9 +81,9 @@ Score the response from 0.0 to 1.0 based on: Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths. -`input_object` is optional grader-only structured data. Use it when the grader prompt needs a stable machine-readable payload through `{{input_object_json}}` while the agent-facing `input` remains a natural-language prompt or message array. If your existing `input` already contains the needed object (for example, a message whose `content` is a JSON object), keep using `input` and reference `{{input}}`; you do not need to duplicate that data into `input_object`. +Structured task input belongs in `input`. If `input` is a message whose `content` is a JSON object, `{{input}}` renders that object as formatted JSON for the grader prompt; no separate grader-only input field is required. Use `metadata` for provenance or suite-level source fields, and `rubrics_json` for rubric arrays. -Suite-level `metadata` is inherited by every test. When rubric items vary per test and you want a separate grader-only payload, keep the grader on each test and reuse the prompt file: +Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file: ```yaml metadata: @@ -95,8 +93,7 @@ metadata: tests: - id: apple-research - input: Research Apple - input_object: + input: company: Apple ticker: AAPL metadata: @@ -228,7 +225,6 @@ TypeScript templates receive a context object with these fields: | `expectedOutput` | `Message[]` | Full resolved expected output | | `output` | `Message[]` | Full provider output messages | | `trace` | `TraceSummary` | Execution metrics summary | -| `inputObject` | `unknown` | Optional structured `input_object` payload | | `metadata` | `object` | Test metadata after suite defaults are merged | | `config` | `object` | Custom config from YAML | @@ -266,7 +262,6 @@ Derived strings injected into grader prompts: | `expected_output` | Reference answer text | | `output` | Candidate answer text | | `metadata_json` | Test metadata, compact JSON | -| `input_object_json` | Optional grader-only structured payload, compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 1aca3abd6..3ec89061b 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -168,7 +168,6 @@ export class CodeGrader implements Grader { context.evalCase.input as readonly Record[], getImageDir, ), - inputObject: context.evalCase.inputObject ?? null, metadata: context.evalCase.metadata ?? null, trace: context.trace ?? null, tokenUsage: context.tokenUsage ?? null, diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index fc12ff0ef..fe79c525e 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -46,8 +46,6 @@ function buildTemplateVariables(input: { [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(input.evalCase.metadata), [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(input.evalCase.metadata), - [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(input.evalCase.inputObject), - [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(input.evalCase.inputObject), [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(input.rubrics), [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(input.rubrics), [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? '', diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 5092da81f..acdf85248 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -177,8 +177,6 @@ function buildTemplateVariables(context: EvaluationContext): Record [{ role: 'user', content: "What is 2+2?" }] + * - Object (without role key): { accuracy: 0.9 } -> [{ role: 'user', content: { accuracy: 0.9 } }] * - Array of messages: Already in message format, passthrough * * @param value The raw `input` value from YAML/JSONL @@ -30,6 +31,15 @@ export function expandInputShorthand(value: JsonValue | undefined): TestMessage[ return [{ role: 'user', content: value }]; } + // Object shorthand: single user message with structured content. + // If it already looks like a message, preserve the existing message shape. + if (isJsonObject(value)) { + if ('role' in value) { + return isTestMessage(value) ? [value] : undefined; + } + return [{ role: 'user', content: value }]; + } + // Array: should be message array if (Array.isArray(value)) { const messages = value.filter((msg): msg is TestMessage => isTestMessage(msg)); diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index 0fd268e09..81e13c36a 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -9,8 +9,6 @@ * - {{ criteria }} — evaluation criteria string * - {{ metadata }} — per-test metadata as formatted JSON * - {{ metadata_json }} — per-test metadata as compact JSON - * - {{ input_object }} — per-test structured input object as formatted JSON - * - {{ input_object_json }} — per-test structured input object as compact JSON * - {{ rubrics }} — llm-grader rubrics as formatted JSON * - {{ rubrics_json }} — llm-grader rubrics as compact JSON * - {{ file_changes }} — file diff (if available) @@ -26,8 +24,6 @@ export const TEMPLATE_VARIABLES = { CRITERIA: 'criteria', METADATA: 'metadata', METADATA_JSON: 'metadata_json', - INPUT_OBJECT: 'input_object', - INPUT_OBJECT_JSON: 'input_object_json', RUBRICS: 'rubrics', RUBRICS_JSON: 'rubrics_json', INPUT: 'input', diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 1b31f5bfc..7e764ff8b 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1009,8 +1009,6 @@ export interface EvalTest { readonly conversation_id?: string; readonly question: string; readonly input: readonly TestMessage[]; - /** Optional structured per-case input payload for grader prompt templates. */ - readonly inputObject?: JsonValue; readonly expected_output: readonly JsonObject[]; readonly reference_answer?: string; readonly file_paths: readonly string[]; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 35c9c34e9..c82592992 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -64,7 +64,7 @@ import type { WorkspaceHooksConfig, WorkspaceScriptConfig, } from './types.js'; -import { isJsonObject, isJsonValue, isTestMessage } from './types.js'; +import { isJsonObject, isTestMessage } from './types.js'; import { parseRepoConfig } from './workspace/repo-config-parser.js'; import { parseYamlValue } from './yaml-loader.js'; @@ -142,7 +142,6 @@ type RawEvalCase = JsonObject & { /** @deprecated Use `criteria` instead */ readonly expected_outcome?: JsonValue; readonly input?: JsonValue; - readonly input_object?: JsonValue; /** Shorthand: list of file paths to prepend as type:file content blocks in the user message. */ readonly input_files?: JsonValue; readonly expected_output?: JsonValue; @@ -635,10 +634,6 @@ async function loadTestsFromYaml( ? (renderedCase.metadata as Record) : undefined; const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload); - const inputObject = isJsonValue(renderedCase.input_object) - ? renderedCase.input_object - : undefined; - // Extract per-test targets override (matrix evaluation) const caseTargets = extractTargetsFromTestCase(renderedCase as JsonObject); @@ -683,7 +678,6 @@ async function loadTestsFromYaml( conversation_id: conversationId, question: question, input: inputMessages, - ...(inputObject !== undefined ? { inputObject } : {}), expected_output: outputSegments, reference_answer: referenceAnswer, file_paths: userFilePaths, diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index 008e8d80f..835084494 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -96,10 +96,10 @@ File Changes: {{file_changes}} expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`); }); - it('substitutes structured metadata, input_object, and rubrics variables', async () => { + it('substitutes structured input, metadata, and rubrics variables', async () => { const customPrompt = ` Metadata: {{metadata_json}} -Input Object: {{input_object_json}} +Input: {{input}} Rubrics: {{rubrics_json}} Candidate: {{output}} `; @@ -124,14 +124,14 @@ Candidate: {{output}} await evaluator.evaluate({ evalCase: { ...baseTestCase, - inputObject: { company: 'Apple', ticker: 'AAPL' }, + input: [{ role: 'user', content: { company: 'Apple', ticker: 'AAPL' } }], metadata: { source_repo: 'https://github.com/virattt/dexter' }, }, candidate: 'Apple revenue increased.', target: baseTarget, provider: graderProvider, attempt: 0, - promptInputs: { question: 'Research Apple' }, + promptInputs: { question: '{\n "company": "Apple",\n "ticker": "AAPL"\n}' }, now: new Date(), evaluator: { name: 'dexter', @@ -150,7 +150,7 @@ Candidate: {{output}} const prompt = graderProvider.lastRequest?.question ?? ''; expect(prompt).toContain('"source_repo":"https://github.com/virattt/dexter"'); - expect(prompt).toContain('"ticker":"AAPL"'); + expect(prompt).toContain('"ticker": "AAPL"'); expect(prompt).toContain('"operator":"correctness"'); expect(prompt).toContain('Candidate: Apple revenue increased.'); }); diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts index 236acf9cb..1c17cec2f 100644 --- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts +++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts @@ -29,7 +29,7 @@ describe('containsTemplateVariables', () => { it('returns true for structured template variables', () => { expect( containsTemplateVariables( - 'Review {{metadata_json}}, {{input_object_json}}, and {{rubrics_json}} against {{output}}', + 'Review {{metadata_json}} and {{rubrics_json}} against {{input}} and {{output}}', ), ).toBe(true); }); diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts index 08280300b..0d46e7867 100644 --- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts +++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts @@ -199,12 +199,11 @@ tests: }); }); - it('loads structured input_object and rubric criteria aliases', async () => { + it('loads structured input objects and rubric criteria aliases', async () => { const { filePath, dir } = createTempYaml(` tests: - id: case-1 - input: "Research Apple" - input_object: + input: company: Apple ticker: AAPL assertions: @@ -217,7 +216,8 @@ tests: `); const suite = await loadTestSuite(filePath, dir); - expect(suite.tests[0].inputObject).toEqual({ company: 'Apple', ticker: 'AAPL' }); + expect(suite.tests[0].input[0].content).toEqual({ company: 'Apple', ticker: 'AAPL' }); + expect(suite.tests[0].question).toContain('"ticker": "AAPL"'); const grader = suite.tests[0].assertions?.[0]; expect(grader?.type).toBe('llm-grader'); if (grader?.type === 'llm-grader') { diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index d92e5604f..4d2eb340e 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -276,7 +276,6 @@ export const CodeGraderInputSchema = z.object({ outputPath: z.string().optional(), inputFiles: z.array(z.string()), input: z.array(MessageSchema), - inputObject: z.unknown().nullable().optional(), metadata: z.record(z.unknown()).nullable().optional(), trace: TraceSummarySchema.nullable().optional(), tokenUsage: TokenUsageSchema.nullable().optional(),