diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index 14f88ba6a..ec5241805 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -29,7 +29,7 @@ Reference an LLM grader in your eval file: assertions: - name: semantic_check type: llm-grader - prompt: ./graders/correctness.md + prompt: file://graders/correctness.md target: grader_gpt_5_mini # optional: route this grader to a named LLM target ``` @@ -69,12 +69,44 @@ Score the response from 0.0 to 1.0 based on: | `output_text` | Last candidate response content | | `expected_output_text` | Last expected message content | | `criteria` | Test `criteria` field | -| `input` | Full resolved input array, JSON-serialized | -| `expected_output` | Full resolved expected array, JSON-serialized | -| `output` | Full provider output array, JSON-serialized | +| `input` | Resolved input text | +| `expected_output` | Reference answer text | +| `output` | Candidate answer text | +| `metadata` | Test metadata as formatted JSON | +| `metadata_json` | Test metadata as compact JSON | +| `rubrics` | LLM-grader rubric items as formatted JSON | +| `rubrics_json` | LLM-grader rubric items as compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | +Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths. + +Structured task input belongs in `input`. If `input` is a message whose `content` is a JSON object, `{{input}}` renders that object as formatted JSON for the grader prompt; no separate grader-only input field is required. Use `metadata` for provenance or suite-level source fields, and `rubrics_json` for rubric arrays. + +Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file: + +```yaml +metadata: + source_repo: https://github.com/virattt/dexter + source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629 + source_file: src/evals/dataset/finance_agent.csv + +tests: + - id: apple-research + input: + company: Apple + ticker: AAPL + metadata: + row: 1 + assertions: + - name: dexter_semantic + type: llm-grader + prompt: file://prompts/dexter-grader.md + rubrics: + - operator: correctness + criteria: Uses the provided ticker and company. +``` + ## Per-Grader Target By default, an `llm-grader` uses the suite target's `grader_target`. Override it per grader when you need multiple grader models in one run: @@ -193,6 +225,7 @@ TypeScript templates receive a context object with these fields: | `expectedOutput` | `Message[]` | Full resolved expected output | | `output` | `Message[]` | Full provider output messages | | `trace` | `TraceSummary` | Execution metrics summary | +| `metadata` | `object` | Test metadata after suite defaults are merged | | `config` | `object` | Custom config from YAML | ## Template Variable Derivation @@ -225,9 +258,10 @@ Derived strings injected into grader prompts: | `criteria` | Passed through from the test field | | `expected_output_text` | Content of the last entry in `expected_output` | | `output_text` | Content of the last entry in `output` | -| `input` | Full resolved input array, JSON-serialized | -| `expected_output` | Full resolved expected array, JSON-serialized | -| `output` | Full provider output array, JSON-serialized | +| `input` | Resolved input text | +| `expected_output` | Reference answer text | +| `output` | Candidate answer text | +| `metadata_json` | Test metadata, compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index b672ab32d..3ec89061b 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -168,6 +168,7 @@ export class CodeGrader implements Grader { context.evalCase.input as readonly Record[], getImageDir, ), + metadata: context.evalCase.metadata ?? null, trace: context.trace ?? null, tokenUsage: context.tokenUsage ?? null, costUsd: context.costUsd ?? null, diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index bb78dd39a..fe79c525e 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -18,6 +18,44 @@ export interface LlmGraderPromptAssembly { mode: 'freeform' | 'checklist' | 'score_range'; } +function stringifyPretty(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value, null, 2); +} + +function stringifyCompact(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value); +} + +function buildTemplateVariables(input: { + evalCase: EvalTest; + candidate: string; + promptInputs: PromptInputs; + rubrics?: readonly RubricItem[]; + fileChanges?: string; + toolCalls?: string; +}): Record { + const formattedQuestion = + input.promptInputs.question && input.promptInputs.question.trim().length > 0 + ? input.promptInputs.question + : input.evalCase.question; + + return { + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: input.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (input.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(input.evalCase.metadata), + [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(input.evalCase.metadata), + [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(input.rubrics), + [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(input.rubrics), + [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? '', + [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? '').trim(), + }; +} + export function assembleLlmGraderPrompt(input: { evalCase: EvalTest; candidate: string; @@ -42,6 +80,17 @@ export function assembleLlmGraderPrompt(input: { // Detect mode if (rubrics && rubrics.length > 0) { + if (graderTemplateOverride) { + return assembleCustom( + evalCase, + candidate, + promptInputs, + rubrics, + fileChanges, + toolCalls, + graderTemplateOverride, + ); + } const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); if (hasScoreRanges) { return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls); @@ -67,23 +116,13 @@ function assembleFreeform( toolCalls?: string, graderTemplateOverride?: string, ): LlmGraderPromptAssembly { - const formattedQuestion = - promptInputs.question && promptInputs.question.trim().length > 0 - ? promptInputs.question - : evalCase.question; - - const variables = { - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '', - // Deprecated aliases - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? '').trim(), - }; + const variables = buildTemplateVariables({ + evalCase, + candidate, + promptInputs, + fileChanges, + toolCalls, + }); const systemPrompt = buildOutputSchema(); const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE; @@ -105,6 +144,37 @@ function assembleFreeform( }; } +function assembleCustom( + evalCase: EvalTest, + candidate: string, + promptInputs: PromptInputs, + rubrics: readonly RubricItem[], + fileChanges: string | undefined, + toolCalls: string | undefined, + graderTemplateOverride: string, +): LlmGraderPromptAssembly { + const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); + const systemPrompt = hasScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema(); + const userPrompt = substituteVariables( + graderTemplateOverride, + buildTemplateVariables({ + evalCase, + candidate, + promptInputs, + rubrics, + fileChanges, + toolCalls, + }), + ); + + return { + systemPrompt, + userPrompt, + responseSchema: systemPrompt, + mode: hasScoreRanges ? 'score_range' : 'checklist', + }; +} + function assembleChecklist( evalCase: EvalTest, candidate: string, diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 3b6f58234..acdf85248 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -155,6 +155,39 @@ interface StructuredGenerationResult { readonly tokenUsage?: TokenUsage; } +function stringifyPretty(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value, null, 2); +} + +function stringifyCompact(value: unknown): string { + return value === undefined ? '' : JSON.stringify(value); +} + +function buildTemplateVariables(context: EvaluationContext): Record { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + const rubrics = context.evaluator?.type === 'llm-grader' ? context.evaluator.rubrics : undefined; + + return { + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata), + [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata), + [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics), + [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', + // Deprecated aliases — same values as the primary variables above + [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), + }; +} + function resolveContentBasePath(context: EvaluationContext): string | undefined { if (context.workspacePath) { return context.workspacePath; @@ -259,25 +292,7 @@ export class LlmGrader implements Grader { context: EvaluationContext, graderProvider: Provider, ): Promise { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - // Prepare template variables for substitution. - // Primary variables resolve to human-readable text; deprecated _text aliases map to the same values. - const variables = { - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', - // Deprecated aliases — same values as the primary variables above - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - }; + const variables = buildTemplateVariables(context); // Build system prompt (only the mandatory output schema) const systemPrompt = buildOutputSchema(); @@ -367,7 +382,10 @@ export class LlmGrader implements Grader { return this.evaluateWithScoreRanges(context, graderProvider, rubrics); } - const prompt = this.buildRubricPrompt(context, rubrics); + const prompt = + context.graderTemplateOverride || this.graderTemplate + ? this.buildCustomPrompt(context) + : this.buildRubricPrompt(context, rubrics); const systemPrompt = buildRubricOutputSchema(); const graderRawRequest: JsonObject = { @@ -423,7 +441,10 @@ export class LlmGrader implements Grader { graderProvider: Provider, rubrics: readonly RubricItem[], ): Promise { - const prompt = this.buildScoreRangePrompt(context, rubrics); + const prompt = + context.graderTemplateOverride || this.graderTemplate + ? this.buildCustomPrompt(context) + : this.buildScoreRangePrompt(context, rubrics); const systemPrompt = buildScoreRangeOutputSchema(); const graderRawRequest: JsonObject = { @@ -688,22 +709,12 @@ export class LlmGrader implements Grader { ? context.promptInputs.question : context.evalCase.question; - const variables: Record = { - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', - // Deprecated aliases - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - }; + const variables = buildTemplateVariables(context); - if (this.graderTemplate) { - warnDeprecatedTemplateVars(this.graderTemplate); - return substituteVariables(this.graderTemplate, variables); + const template = context.graderTemplateOverride ?? this.graderTemplate; + if (template) { + warnDeprecatedTemplateVars(template); + return substituteVariables(template, variables); } const config = context.evaluator; @@ -767,21 +778,11 @@ export class LlmGrader implements Grader { const config = context.evaluator; const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined; - if (this.graderTemplate) { - const variables: Record = { - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', - // Deprecated aliases - [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - }; - warnDeprecatedTemplateVars(this.graderTemplate); - const customPrompt = substituteVariables(this.graderTemplate, variables); + const template = context.graderTemplateOverride ?? this.graderTemplate; + if (template) { + const variables = buildTemplateVariables(context); + warnDeprecatedTemplateVars(template); + const customPrompt = substituteVariables(template, variables); const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); @@ -984,6 +985,12 @@ export class LlmGrader implements Grader { return parts.join('\n'); } + private buildCustomPrompt(context: EvaluationContext): string { + const template = context.graderTemplateOverride ?? this.graderTemplate ?? ''; + warnDeprecatedTemplateVars(template); + return substituteVariables(template, buildTemplateVariables(context)); + } + private buildRubricPrompt(context: EvaluationContext, rubrics: readonly RubricItem[]): string { const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts index 4e9bf7a5a..b31717047 100644 --- a/packages/core/src/evaluation/graders/prompt-resolution.ts +++ b/packages/core/src/evaluation/graders/prompt-resolution.ts @@ -103,6 +103,7 @@ async function executePromptTemplate( output: context.output ?? null, inputFiles: context.evalCase.file_paths, input: context.evalCase.input, + metadata: context.evalCase.metadata ?? null, trace: context.trace ?? null, fileChanges: context.fileChanges ?? null, workspacePath: context.workspacePath ?? null, diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 92ba2832b..7fa75aea9 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -2143,7 +2143,7 @@ function parseRubricItems( } const id = asString(rawRubric.id) ?? `rubric-${index + 1}`; - const expectedOutcome = asString(rawRubric.outcome) ?? ''; + const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? ''; const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId); const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0; diff --git a/packages/core/src/evaluation/loaders/shorthand-expansion.ts b/packages/core/src/evaluation/loaders/shorthand-expansion.ts index bcc4e37f0..a8fbd9008 100644 --- a/packages/core/src/evaluation/loaders/shorthand-expansion.ts +++ b/packages/core/src/evaluation/loaders/shorthand-expansion.ts @@ -15,6 +15,7 @@ import { isJsonObject, isTestMessage } from '../types.js'; * * Supports: * - String: "What is 2+2?" -> [{ role: 'user', content: "What is 2+2?" }] + * - Object (without role key): { accuracy: 0.9 } -> [{ role: 'user', content: { accuracy: 0.9 } }] * - Array of messages: Already in message format, passthrough * * @param value The raw `input` value from YAML/JSONL @@ -30,6 +31,15 @@ export function expandInputShorthand(value: JsonValue | undefined): TestMessage[ return [{ role: 'user', content: value }]; } + // Object shorthand: single user message with structured content. + // If it already looks like a message, preserve the existing message shape. + if (isJsonObject(value)) { + if ('role' in value) { + return isTestMessage(value) ? [value] : undefined; + } + return [{ role: 'user', content: value }]; + } + // Array: should be message array if (Array.isArray(value)) { const messages = value.filter((msg): msg is TestMessage => isTestMessage(msg)); diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index 9d92f0d87..81e13c36a 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -7,6 +7,10 @@ * - {{ output }} — last assistant message as plain text * - {{ expected_output }} — reference answer as plain text * - {{ criteria }} — evaluation criteria string + * - {{ metadata }} — per-test metadata as formatted JSON + * - {{ metadata_json }} — per-test metadata as compact JSON + * - {{ rubrics }} — llm-grader rubrics as formatted JSON + * - {{ rubrics_json }} — llm-grader rubrics as compact JSON * - {{ file_changes }} — file diff (if available) * - {{ tool_calls }} — formatted summary of tool calls from agent execution * @@ -18,6 +22,10 @@ export const TEMPLATE_VARIABLES = { EXPECTED_OUTPUT: 'expected_output', CRITERIA: 'criteria', + METADATA: 'metadata', + METADATA_JSON: 'metadata_json', + RUBRICS: 'rubrics', + RUBRICS_JSON: 'rubrics_json', INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 8bf48d414..c82592992 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -119,6 +119,8 @@ type RawTestSuite = JsonObject & { /** @deprecated Use `assertions` instead */ readonly assert?: JsonValue; readonly input?: JsonValue; + readonly metadata?: JsonValue; + readonly governance?: JsonValue; /** Shorthand: list of file paths to prepend as type:file content blocks in each test's user message. */ readonly input_files?: JsonValue; // Suite-level metadata fields @@ -431,9 +433,9 @@ async function loadTestsFromYaml( const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir); - // Suite-level governance block (top-level `governance:` wins over `metadata.governance:`). - // Merged into each case's `metadata.governance` via mergeSuiteMetadataPayload. - const suiteGovernance = extractSuiteGovernance(suite); + // Suite-level metadata defaults. Top-level `metadata:` is inherited by each case. + // Top-level `governance:` wins over `metadata.governance:` for compatibility. + const suiteMetadataPayload = extractSuiteMetadataPayload(suite); const rawSuiteInput = suite.input; const rawSuiteInputFiles = suite.input_files; @@ -631,10 +633,7 @@ async function loadTestsFromYaml( const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? (renderedCase.metadata as Record) : undefined; - const suitePayload = - suiteGovernance !== undefined ? { governance: suiteGovernance } : undefined; - const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload); - + const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload); // Extract per-test targets override (matrix evaluation) const caseTargets = extractTargetsFromTestCase(renderedCase as JsonObject); @@ -1328,23 +1327,26 @@ function asString(value: unknown): string | undefined { } /** - * Pull the optional `governance` block out of a suite YAML. Top-level `governance:` wins - * over the nested `metadata.governance:` form so that authors who already use top-level - * suite metadata fields (`name`, `description`, `tags`) can keep their existing layout. + * Build metadata defaults inherited by each test case. Top-level `metadata:` carries + * arbitrary domain/source fields; top-level `governance:` wins over nested + * `metadata.governance:` so existing governance evals keep their precedence. */ -function extractSuiteGovernance(suite: RawTestSuite): Record | undefined { +function extractSuiteMetadataPayload(suite: RawTestSuite): Record | undefined { + const payload = isJsonObject(suite.metadata) + ? ({ ...(suite.metadata as Record) } as Record) + : {}; + const top = (suite as JsonObject).governance; if (isJsonObject(top)) { - return top as Record; - } - const wrapper = (suite as JsonObject).metadata; - if (isJsonObject(wrapper)) { - const nested = (wrapper as JsonObject).governance; + payload.governance = top as Record; + } else { + const nested = payload.governance; if (isJsonObject(nested)) { - return nested as Record; + payload.governance = nested as Record; } } - return undefined; + + return Object.keys(payload).length > 0 ? payload : undefined; } /** diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index dbd925480..835084494 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -96,6 +96,65 @@ File Changes: {{file_changes}} expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`); }); + it('substitutes structured input, metadata, and rubrics variables', async () => { + const customPrompt = ` +Metadata: {{metadata_json}} +Input: {{input}} +Rubrics: {{rubrics_json}} +Candidate: {{output}} +`; + + const graderProvider = new CapturingProvider({ + output: [ + { + role: 'assistant', + content: JSON.stringify({ + checks: [{ id: 'factual', satisfied: true, reasoning: 'Matches' }], + overall_reasoning: 'OK', + }), + }, + ], + }); + + const evaluator = new LlmGrader({ + resolveGraderProvider: async () => graderProvider, + graderTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: { + ...baseTestCase, + input: [{ role: 'user', content: { company: 'Apple', ticker: 'AAPL' } }], + metadata: { source_repo: 'https://github.com/virattt/dexter' }, + }, + candidate: 'Apple revenue increased.', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: '{\n "company": "Apple",\n "ticker": "AAPL"\n}' }, + now: new Date(), + evaluator: { + name: 'dexter', + type: 'llm-grader', + rubrics: [ + { + id: 'factual', + operator: 'correctness', + outcome: 'Uses the supplied ticker', + weight: 1, + required: true, + }, + ], + }, + }); + + const prompt = graderProvider.lastRequest?.question ?? ''; + expect(prompt).toContain('"source_repo":"https://github.com/virattt/dexter"'); + expect(prompt).toContain('"ticker": "AAPL"'); + expect(prompt).toContain('"operator":"correctness"'); + expect(prompt).toContain('Candidate: Apple revenue increased.'); + }); + it('deprecated _text aliases still resolve correctly', async () => { const formattedQuestion = 'What is 2+2?'; const customPrompt = ` diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts index 02ef7d946..1c17cec2f 100644 --- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts +++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts @@ -26,6 +26,14 @@ describe('containsTemplateVariables', () => { expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true); }); + it('returns true for structured template variables', () => { + expect( + containsTemplateVariables( + 'Review {{metadata_json}} and {{rubrics_json}} against {{input}} and {{output}}', + ), + ).toBe(true); + }); + it('returns true for deprecated {{output_text}} variable', () => { expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true); }); diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts index 12dea2bde..0d46e7867 100644 --- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts +++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts @@ -173,4 +173,59 @@ tests: owasp_llm_top_10_2025: ['LLM01'], }); }); + + it('merges arbitrary suite metadata into each case and lets case scalars override', async () => { + const { filePath, dir } = createTempYaml(` +metadata: + source_repo: https://github.com/virattt/dexter + source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629 + source_file: src/evals/dataset/finance_agent.csv + tags: [suite] +tests: + - id: case-1 + criteria: "Answer" + input: "Query" + metadata: + source_file: override.csv + tags: [case] +`); + + const suite = await loadTestSuite(filePath, dir); + expect(suite.tests[0].metadata).toMatchObject({ + source_repo: 'https://github.com/virattt/dexter', + source_commit: '8d9419829f443f84b804d033bb2c3b1fbd788629', + source_file: 'override.csv', + tags: ['suite', 'case'], + }); + }); + + it('loads structured input objects and rubric criteria aliases', async () => { + const { filePath, dir } = createTempYaml(` +tests: + - id: case-1 + input: + company: Apple + ticker: AAPL + assertions: + - name: dexter_rubric + type: llm-grader + rubrics: + - id: factual + operator: correctness + criteria: "Uses the supplied company and ticker" +`); + + const suite = await loadTestSuite(filePath, dir); + expect(suite.tests[0].input[0].content).toEqual({ company: 'Apple', ticker: 'AAPL' }); + expect(suite.tests[0].question).toContain('"ticker": "AAPL"'); + const grader = suite.tests[0].assertions?.[0]; + expect(grader?.type).toBe('llm-grader'); + if (grader?.type === 'llm-grader') { + expect(grader.rubrics?.[0]).toMatchObject({ + id: 'factual', + operator: 'correctness', + outcome: 'Uses the supplied company and ticker', + }); + } + }); }); diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 374ca6651..4d2eb340e 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -276,6 +276,7 @@ export const CodeGraderInputSchema = z.object({ outputPath: z.string().optional(), inputFiles: z.array(z.string()), input: z.array(MessageSchema), + metadata: z.record(z.unknown()).nullable().optional(), trace: TraceSummarySchema.nullable().optional(), tokenUsage: TokenUsageSchema.nullable().optional(), costUsd: z.number().nullable().optional(),