From e9888b38c94b17f3653f8450a46fc8da39897dd9 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 10 Jun 2026 04:54:53 +0200
Subject: [PATCH 1/3] feat(core): add structured llm grader inputs

---
 .../content/docs/docs/graders/llm-graders.mdx |  51 ++++++--
 .../src/evaluation/graders/code-grader.ts     |   2 +
 .../evaluation/graders/llm-grader-prompt.ts   | 106 ++++++++++++++---
 .../core/src/evaluation/graders/llm-grader.ts | 111 ++++++++++--------
 .../evaluation/graders/prompt-resolution.ts   |   2 +
 .../src/evaluation/loaders/grader-parser.ts   |   2 +-
 .../core/src/evaluation/template-variables.ts |  12 ++
 packages/core/src/evaluation/types.ts         |   2 +
 packages/core/src/evaluation/yaml-parser.ts   |  44 ++++---
 .../evaluation/evaluators_variables.test.ts   |  59 ++++++++++
 .../graders/prompt-resolution.test.ts         |   8 ++
 .../evaluation/yaml-parser-metadata.test.ts   |  55 +++++++++
 packages/eval/src/schemas.ts                  |   2 +
 13 files changed, 362 insertions(+), 94 deletions(-)

diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
index 14f88ba6a..cc40ab2c1 100644
--- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
@@ -29,7 +29,7 @@ Reference an LLM grader in your eval file:
 assertions:
   - name: semantic_check
     type: llm-grader
-    prompt: ./graders/correctness.md
+    prompt: file://graders/correctness.md
     target: grader_gpt_5_mini   # optional: route this grader to a named LLM target
 ```
 
@@ -69,12 +69,45 @@ Score the response from 0.0 to 1.0 based on:
 | `output_text` | Last candidate response content |
 | `expected_output_text` | Last expected message content |
 | `criteria` | Test `criteria` field |
-| `input` | Full resolved input array, JSON-serialized |
-| `expected_output` | Full resolved expected array, JSON-serialized |
-| `output` | Full provider output array, JSON-serialized |
+| `input` | Resolved input text |
+| `expected_output` | Reference answer text |
+| `output` | Candidate answer text |
+| `metadata` | Test metadata as formatted JSON |
+| `metadata_json` | Test metadata as compact JSON |
+| `input_object` | Test `input_object` as formatted JSON |
+| `input_object_json` | Test `input_object` as compact JSON |
+| `rubrics` | LLM-grader rubric items as formatted JSON |
+| `rubrics_json` | LLM-grader rubric items as compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
 
+Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths.
+
+Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file:
+
+```yaml
+metadata:
+  source_repo: https://github.com/virattt/dexter
+  source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629
+  source_file: src/evals/dataset/finance_agent.csv
+
+tests:
+  - id: apple-research
+    input: Research Apple
+    input_object:
+      company: Apple
+      ticker: AAPL
+    metadata:
+      row: 1
+    assertions:
+      - name: dexter_semantic
+        type: llm-grader
+        prompt: file://prompts/dexter-grader.md
+        rubrics:
+          - operator: correctness
+            criteria: Uses the provided ticker and company.
+```
+
 ## Per-Grader Target
 
 By default, an `llm-grader` uses the suite target's `grader_target`. Override it per grader when you need multiple grader models in one run:
@@ -193,6 +226,8 @@ TypeScript templates receive a context object with these fields:
 | `expectedOutput` | `Message[]` | Full resolved expected output |
 | `output` | `Message[]` | Full provider output messages |
 | `trace` | `TraceSummary` | Execution metrics summary |
+| `inputObject` | `unknown` | Optional structured `input_object` payload (`null` when the test has none) |
+| `metadata` | `object` | Test metadata after suite defaults are merged (`null` when no metadata is defined) |
 | `config` | `object` | Custom config from YAML |
 
 ## Template Variable Derivation
@@ -225,9 +260,11 @@ Derived strings injected into grader prompts:
 | `criteria` | Passed through from the test field |
 | `expected_output_text` | Content of the last entry in `expected_output` |
 | `output_text` | Content of the last entry in `output` |
-| `input` | Full resolved input array, JSON-serialized |
-| `expected_output` | Full resolved expected array, JSON-serialized |
-| `output` | Full provider output array, JSON-serialized |
+| `input` | Resolved input text |
+| `expected_output` | Reference answer text |
+| `output` | Candidate answer text |
+| `metadata_json` | Test metadata, compact JSON |
+| `input_object_json` | Structured test input object, compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
 
diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts
index b672ab32d..1aca3abd6 100644
--- a/packages/core/src/evaluation/graders/code-grader.ts
+++ b/packages/core/src/evaluation/graders/code-grader.ts
@@ -168,6 +168,8 @@ export class CodeGrader implements Grader {
         context.evalCase.input as readonly Record<string, unknown>[],
         getImageDir,
       ),
+      inputObject: context.evalCase.inputObject ?? null,
+      metadata: context.evalCase.metadata ?? null,
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,
diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts
index bb78dd39a..fc12ff0ef 100644
--- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts
+++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts
@@ -18,6 +18,46 @@ export interface LlmGraderPromptAssembly {
   mode: 'freeform' | 'checklist' | 'score_range';
 }
 
+function stringifyPretty(value: unknown): string {
+  return value === undefined ? '' : JSON.stringify(value, null, 2);
+}
+
+function stringifyCompact(value: unknown): string {
+  return value === undefined ? '' : JSON.stringify(value);
+}
+
+function buildTemplateVariables(input: {
+  evalCase: EvalTest;
+  candidate: string;
+  promptInputs: PromptInputs;
+  rubrics?: readonly RubricItem[];
+  fileChanges?: string;
+  toolCalls?: string;
+}): Record<string, string> {
+  const formattedQuestion =
+    input.promptInputs.question && input.promptInputs.question.trim().length > 0
+      ? input.promptInputs.question
+      : input.evalCase.question;
+
+  return {
+    [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
+    [TEMPLATE_VARIABLES.OUTPUT]: input.candidate.trim(),
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (input.evalCase.reference_answer ?? '').trim(),
+    [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(),
+    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(input.evalCase.metadata),
+    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(input.evalCase.metadata),
+    [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(input.evalCase.inputObject),
+    [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(input.evalCase.inputObject),
+    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(input.rubrics),
+    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(input.rubrics),
+    [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? '',
+    [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? '',
+    [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
+    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(),
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? '').trim(),
+  };
+}
+
 export function assembleLlmGraderPrompt(input: {
   evalCase: EvalTest;
   candidate: string;
@@ -42,6 +82,17 @@ export function assembleLlmGraderPrompt(input: {
 
   // Detect mode
   if (rubrics && rubrics.length > 0) {
+    if (graderTemplateOverride) {
+      return assembleCustom(
+        evalCase,
+        candidate,
+        promptInputs,
+        rubrics,
+        fileChanges,
+        toolCalls,
+        graderTemplateOverride,
+      );
+    }
     const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
     if (hasScoreRanges) {
       return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
@@ -67,23 +118,13 @@ function assembleFreeform(
   toolCalls?: string,
   graderTemplateOverride?: string,
 ): LlmGraderPromptAssembly {
-  const formattedQuestion =
-    promptInputs.question && promptInputs.question.trim().length > 0
-      ? promptInputs.question
-      : evalCase.question;
-
-  const variables = {
-    [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-    [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
-    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(),
-    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
-    [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '',
-    [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '',
-    // Deprecated aliases
-    [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
-    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? '').trim(),
-  };
+  const variables = buildTemplateVariables({
+    evalCase,
+    candidate,
+    promptInputs,
+    fileChanges,
+    toolCalls,
+  });
 
   const systemPrompt = buildOutputSchema();
   const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
@@ -105,6 +146,37 @@ function assembleFreeform(
   };
 }
 
+function assembleCustom(
+  evalCase: EvalTest,
+  candidate: string,
+  promptInputs: PromptInputs,
+  rubrics: readonly RubricItem[],
+  fileChanges: string | undefined,
+  toolCalls: string | undefined,
+  graderTemplateOverride: string,
+): LlmGraderPromptAssembly {
+  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
+  const systemPrompt = hasScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema();
+  const userPrompt = substituteVariables(
+    graderTemplateOverride,
+    buildTemplateVariables({
+      evalCase,
+      candidate,
+      promptInputs,
+      rubrics,
+      fileChanges,
+      toolCalls,
+    }),
+  );
+
+  return {
+    systemPrompt,
+    userPrompt,
+    responseSchema: systemPrompt,
+    mode: hasScoreRanges ? 'score_range' : 'checklist',
+  };
+}
+
 function assembleChecklist(
   evalCase: EvalTest,
   candidate: string,
diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts
index 3b6f58234..5092da81f 100644
--- a/packages/core/src/evaluation/graders/llm-grader.ts
+++ b/packages/core/src/evaluation/graders/llm-grader.ts
@@ -155,6 +155,41 @@ interface StructuredGenerationResult {
   readonly tokenUsage?: TokenUsage;
 }
 
+function stringifyPretty(value: unknown): string {
+  return value === undefined ? '' : JSON.stringify(value, null, 2);
+}
+
+function stringifyCompact(value: unknown): string {
+  return value === undefined ? '' : JSON.stringify(value);
+}
+
+function buildTemplateVariables(context: EvaluationContext): Record<string, string> {
+  const formattedQuestion =
+    context.promptInputs.question && context.promptInputs.question.trim().length > 0
+      ? context.promptInputs.question
+      : context.evalCase.question;
+  const rubrics = context.evaluator?.type === 'llm-grader' ? context.evaluator.rubrics : undefined;
+
+  return {
+    [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
+    [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
+    [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
+    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata),
+    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata),
+    [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(context.evalCase.inputObject),
+    [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(context.evalCase.inputObject),
+    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
+    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
+    [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
+    [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
+    // Deprecated aliases — same values as the primary variables above
+    [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
+    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
+  };
+}
+
 function resolveContentBasePath(context: EvaluationContext): string | undefined {
   if (context.workspacePath) {
     return context.workspacePath;
@@ -259,25 +294,7 @@ export class LlmGrader implements Grader {
     context: EvaluationContext,
     graderProvider: Provider,
   ): Promise<EvaluationScore> {
-    const formattedQuestion =
-      context.promptInputs.question && context.promptInputs.question.trim().length > 0
-        ? context.promptInputs.question
-        : context.evalCase.question;
-
-    // Prepare template variables for substitution.
-    // Primary variables resolve to human-readable text; deprecated _text aliases map to the same values.
-    const variables = {
-      [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
-      [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
-      [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
-      [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
-      // Deprecated aliases — same values as the primary variables above
-      [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
-    };
+    const variables = buildTemplateVariables(context);
 
     // Build system prompt (only the mandatory output schema)
     const systemPrompt = buildOutputSchema();
@@ -367,7 +384,10 @@ export class LlmGrader implements Grader {
       return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
     }
 
-    const prompt = this.buildRubricPrompt(context, rubrics);
+    const prompt =
+      context.graderTemplateOverride || this.graderTemplate
+        ? this.buildCustomPrompt(context)
+        : this.buildRubricPrompt(context, rubrics);
     const systemPrompt = buildRubricOutputSchema();
 
     const graderRawRequest: JsonObject = {
@@ -423,7 +443,10 @@ export class LlmGrader implements Grader {
     graderProvider: Provider,
     rubrics: readonly RubricItem[],
   ): Promise<EvaluationScore> {
-    const prompt = this.buildScoreRangePrompt(context, rubrics);
+    const prompt =
+      context.graderTemplateOverride || this.graderTemplate
+        ? this.buildCustomPrompt(context)
+        : this.buildScoreRangePrompt(context, rubrics);
     const systemPrompt = buildScoreRangeOutputSchema();
 
     const graderRawRequest: JsonObject = {
@@ -688,22 +711,12 @@ export class LlmGrader implements Grader {
         ? context.promptInputs.question
         : context.evalCase.question;
 
-    const variables: Record<string, string> = {
-      [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
-      [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
-      [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
-      [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
-      // Deprecated aliases
-      [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
-    };
+    const variables = buildTemplateVariables(context);
 
-    if (this.graderTemplate) {
-      warnDeprecatedTemplateVars(this.graderTemplate);
-      return substituteVariables(this.graderTemplate, variables);
+    const template = context.graderTemplateOverride ?? this.graderTemplate;
+    if (template) {
+      warnDeprecatedTemplateVars(template);
+      return substituteVariables(template, variables);
     }
 
     const config = context.evaluator;
@@ -767,21 +780,11 @@ export class LlmGrader implements Grader {
     const config = context.evaluator;
     const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
 
-    if (this.graderTemplate) {
-      const variables: Record<string, string> = {
-        [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
-        [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-        [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
-        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
-        [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
-        [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
-        // Deprecated aliases
-        [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-        [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
-        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
-      };
-      warnDeprecatedTemplateVars(this.graderTemplate);
-      const customPrompt = substituteVariables(this.graderTemplate, variables);
+    const template = context.graderTemplateOverride ?? this.graderTemplate;
+    if (template) {
+      const variables = buildTemplateVariables(context);
+      warnDeprecatedTemplateVars(template);
+      const customPrompt = substituteVariables(template, variables);
 
       const outputSchema =
         rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
@@ -984,6 +987,19 @@ export class LlmGrader implements Grader {
     return parts.join('\n');
   }
 
+  /**
+   * Render the user-supplied grader template with the shared template variables.
+   *
+   * Template selection uses `||` rather than `??` so that an empty-string
+   * override falls back to the grader's own template, matching the truthiness
+   * check callers use to decide whether to take the custom-prompt path.
+   */
+  private buildCustomPrompt(context: EvaluationContext): string {
+    const template = context.graderTemplateOverride || this.graderTemplate || '';
+    warnDeprecatedTemplateVars(template);
+    return substituteVariables(template, buildTemplateVariables(context));
+  }
+
   private buildRubricPrompt(context: EvaluationContext, rubrics: readonly RubricItem[]): string {
     const formattedQuestion =
       context.promptInputs.question && context.promptInputs.question.trim().length > 0
diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts
index 4e9bf7a5a..0e90fd47f 100644
--- a/packages/core/src/evaluation/graders/prompt-resolution.ts
+++ b/packages/core/src/evaluation/graders/prompt-resolution.ts
@@ -103,6 +103,8 @@ async function executePromptTemplate(
     output: context.output ?? null,
     inputFiles: context.evalCase.file_paths,
     input: context.evalCase.input,
+    inputObject: context.evalCase.inputObject ?? null,
+    metadata: context.evalCase.metadata ?? null,
     trace: context.trace ?? null,
     fileChanges: context.fileChanges ?? null,
     workspacePath: context.workspacePath ?? null,
diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts
index 92ba2832b..7fa75aea9 100644
--- a/packages/core/src/evaluation/loaders/grader-parser.ts
+++ b/packages/core/src/evaluation/loaders/grader-parser.ts
@@ -2143,7 +2143,7 @@ function parseRubricItems(
     }
 
     const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
-    const expectedOutcome = asString(rawRubric.outcome) ?? '';
+    const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? '';
     const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
     const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0;
 
diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts
index 9d92f0d87..0fd268e09 100644
--- a/packages/core/src/evaluation/template-variables.ts
+++ b/packages/core/src/evaluation/template-variables.ts
@@ -7,6 +7,12 @@
  *   - {{ output }}          — last assistant message as plain text
  *   - {{ expected_output }} — reference answer as plain text
  *   - {{ criteria }}        — evaluation criteria string
+ *   - {{ metadata }}        — per-test metadata as formatted JSON
+ *   - {{ metadata_json }}   — per-test metadata as compact JSON
+ *   - {{ input_object }}      — per-test structured input object as formatted JSON
+ *   - {{ input_object_json }} — per-test structured input object as compact JSON
+ *   - {{ rubrics }}        — llm-grader rubrics as formatted JSON
+ *   - {{ rubrics_json }}   — llm-grader rubrics as compact JSON
  *   - {{ file_changes }}    — file diff (if available)
  *   - {{ tool_calls }}     — formatted summary of tool calls from agent execution
  *
@@ -18,6 +24,12 @@
 export const TEMPLATE_VARIABLES = {
   EXPECTED_OUTPUT: 'expected_output',
   CRITERIA: 'criteria',
+  METADATA: 'metadata',
+  METADATA_JSON: 'metadata_json',
+  INPUT_OBJECT: 'input_object',
+  INPUT_OBJECT_JSON: 'input_object_json',
+  RUBRICS: 'rubrics',
+  RUBRICS_JSON: 'rubrics_json',
   INPUT: 'input',
   OUTPUT: 'output',
   FILE_CHANGES: 'file_changes',
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 7e764ff8b..1b31f5bfc 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -1009,6 +1009,8 @@ export interface EvalTest {
   readonly conversation_id?: string;
   readonly question: string;
   readonly input: readonly TestMessage[];
+  /** Optional structured per-case input payload for grader prompt templates. */
+  readonly inputObject?: JsonValue;
   readonly expected_output: readonly JsonObject[];
   readonly reference_answer?: string;
   readonly file_paths: readonly string[];
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 8bf48d414..35c9c34e9 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -64,7 +64,7 @@ import type {
   WorkspaceHooksConfig,
   WorkspaceScriptConfig,
 } from './types.js';
-import { isJsonObject, isTestMessage } from './types.js';
+import { isJsonObject, isJsonValue, isTestMessage } from './types.js';
 import { parseRepoConfig } from './workspace/repo-config-parser.js';
 import { parseYamlValue } from './yaml-loader.js';
 
@@ -119,6 +119,8 @@ type RawTestSuite = JsonObject & {
   /** @deprecated Use `assertions` instead */
   readonly assert?: JsonValue;
   readonly input?: JsonValue;
+  readonly metadata?: JsonValue;
+  readonly governance?: JsonValue;
   /** Shorthand: list of file paths to prepend as type:file content blocks in each test's user message. */
   readonly input_files?: JsonValue;
   // Suite-level metadata fields
@@ -140,6 +142,7 @@ type RawEvalCase = JsonObject & {
   /** @deprecated Use `criteria` instead */
   readonly expected_outcome?: JsonValue;
   readonly input?: JsonValue;
+  readonly input_object?: JsonValue;
   /** Shorthand: list of file paths to prepend as type:file content blocks in the user message. */
   readonly input_files?: JsonValue;
   readonly expected_output?: JsonValue;
@@ -431,9 +434,9 @@ async function loadTestsFromYaml(
 
   const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
 
-  // Suite-level governance block (top-level `governance:` wins over `metadata.governance:`).
-  // Merged into each case's `metadata.governance` via mergeSuiteMetadataPayload.
-  const suiteGovernance = extractSuiteGovernance(suite);
+  // Suite-level metadata defaults. Top-level `metadata:` is inherited by each case.
+  // Top-level `governance:` wins over `metadata.governance:` for compatibility.
+  const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
 
   const rawSuiteInput = suite.input;
   const rawSuiteInputFiles = suite.input_files;
@@ -631,9 +634,10 @@ async function loadTestsFromYaml(
     const rawCaseMetadata = isJsonObject(renderedCase.metadata)
       ? (renderedCase.metadata as Record<string, unknown>)
       : undefined;
-    const suitePayload =
-      suiteGovernance !== undefined ? { governance: suiteGovernance } : undefined;
-    const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload);
+    const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
+    const inputObject = isJsonValue(renderedCase.input_object)
+      ? renderedCase.input_object
+      : undefined;
 
     // Extract per-test targets override (matrix evaluation)
     const caseTargets = extractTargetsFromTestCase(renderedCase as JsonObject);
@@ -679,6 +683,7 @@ async function loadTestsFromYaml(
       conversation_id: conversationId,
       question: question,
       input: inputMessages,
+      ...(inputObject !== undefined ? { inputObject } : {}),
       expected_output: outputSegments,
       reference_answer: referenceAnswer,
       file_paths: userFilePaths,
@@ -1328,23 +1333,31 @@ function asString(value: unknown): string | undefined {
 }
 
 /**
- * Pull the optional `governance` block out of a suite YAML. Top-level `governance:` wins
- * over the nested `metadata.governance:` form so that authors who already use top-level
- * suite metadata fields (`name`, `description`, `tags`) can keep their existing layout.
+ * Build metadata defaults inherited by each test case. Top-level `metadata:` carries
+ * arbitrary domain/source fields; top-level `governance:` wins over nested
+ * `metadata.governance:` so existing governance evals keep their precedence.
+ *
+ * A `governance` value is only inherited when it is an object: a malformed
+ * (non-object) `metadata.governance` is dropped rather than copied through with
+ * the other metadata fields.
+ *
+ * @returns The suite-level metadata payload, or `undefined` when it has no keys.
  */
-function extractSuiteGovernance(suite: RawTestSuite): Record<string, unknown> | undefined {
+function extractSuiteMetadataPayload(suite: RawTestSuite): Record<string, unknown> | undefined {
+  const suiteMetadata: Record<string, unknown> = isJsonObject(suite.metadata)
+    ? (suite.metadata as Record<string, unknown>)
+    : {};
+  // Split `governance` out so it is re-added below only after validation.
+  const { governance: nested, ...payload } = suiteMetadata;
+
   const top = (suite as JsonObject).governance;
   if (isJsonObject(top)) {
-    return top as Record<string, unknown>;
-  }
-  const wrapper = (suite as JsonObject).metadata;
-  if (isJsonObject(wrapper)) {
-    const nested = (wrapper as JsonObject).governance;
-    if (isJsonObject(nested)) {
-      return nested as Record<string, unknown>;
-    }
+    payload.governance = top as Record<string, unknown>;
+  } else if (isJsonObject(nested)) {
+    payload.governance = nested as Record<string, unknown>;
   }
-  return undefined;
+
+  return Object.keys(payload).length > 0 ? payload : undefined;
 }
 
 /**
diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts
index dbd925480..008e8d80f 100644
--- a/packages/core/test/evaluation/evaluators_variables.test.ts
+++ b/packages/core/test/evaluation/evaluators_variables.test.ts
@@ -96,6 +96,65 @@ File Changes: {{file_changes}}
     expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`);
   });
 
+  it('substitutes structured metadata, input_object, and rubrics variables', async () => {
+    const customPrompt = `
+Metadata: {{metadata_json}}
+Input Object: {{input_object_json}}
+Rubrics: {{rubrics_json}}
+Candidate: {{output}}
+`;
+
+    const graderProvider = new CapturingProvider({
+      output: [
+        {
+          role: 'assistant',
+          content: JSON.stringify({
+            checks: [{ id: 'factual', satisfied: true, reasoning: 'Matches' }],
+            overall_reasoning: 'OK',
+          }),
+        },
+      ],
+    });
+
+    const evaluator = new LlmGrader({
+      resolveGraderProvider: async () => graderProvider,
+      graderTemplate: customPrompt,
+    });
+
+    await evaluator.evaluate({
+      evalCase: {
+        ...baseTestCase,
+        inputObject: { company: 'Apple', ticker: 'AAPL' },
+        metadata: { source_repo: 'https://github.com/virattt/dexter' },
+      },
+      candidate: 'Apple revenue increased.',
+      target: baseTarget,
+      provider: graderProvider,
+      attempt: 0,
+      promptInputs: { question: 'Research Apple' },
+      now: new Date(),
+      evaluator: {
+        name: 'dexter',
+        type: 'llm-grader',
+        rubrics: [
+          {
+            id: 'factual',
+            operator: 'correctness',
+            outcome: 'Uses the supplied ticker',
+            weight: 1,
+            required: true,
+          },
+        ],
+      },
+    });
+
+    const prompt = graderProvider.lastRequest?.question ?? '';
+    expect(prompt).toContain('"source_repo":"https://github.com/virattt/dexter"');
+    expect(prompt).toContain('"ticker":"AAPL"');
+    expect(prompt).toContain('"operator":"correctness"');
+    expect(prompt).toContain('Candidate: Apple revenue increased.');
+  });
+
   it('deprecated _text aliases still resolve correctly', async () => {
     const formattedQuestion = 'What is 2+2?';
     const customPrompt = `
diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
index 02ef7d946..236acf9cb 100644
--- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts
+++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
@@ -26,6 +26,14 @@ describe('containsTemplateVariables', () => {
     expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true);
   });
 
+  it('returns true for structured template variables', () => {
+    expect(
+      containsTemplateVariables(
+        'Review {{metadata_json}}, {{input_object_json}}, and {{rubrics_json}} against {{output}}',
+      ),
+    ).toBe(true);
+  });
+
   it('returns true for deprecated {{output_text}} variable', () => {
     expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true);
   });
diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts
index 12dea2bde..08280300b 100644
--- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts
+++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts
@@ -173,4 +173,59 @@ tests:
       owasp_llm_top_10_2025: ['LLM01'],
     });
   });
+
+  it('merges arbitrary suite metadata into each case and lets case scalars override', async () => {
+    const { filePath, dir } = createTempYaml(`
+metadata:
+  source_repo: https://github.com/virattt/dexter
+  source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629
+  source_file: src/evals/dataset/finance_agent.csv
+  tags: [suite]
+tests:
+  - id: case-1
+    criteria: "Answer"
+    input: "Query"
+    metadata:
+      source_file: override.csv
+      tags: [case]
+`);
+
+    const suite = await loadTestSuite(filePath, dir);
+    expect(suite.tests[0].metadata).toMatchObject({
+      source_repo: 'https://github.com/virattt/dexter',
+      source_commit: '8d9419829f443f84b804d033bb2c3b1fbd788629',
+      source_file: 'override.csv',
+      tags: ['suite', 'case'],
+    });
+  });
+
+  it('loads structured input_object and rubric criteria aliases', async () => {
+    const { filePath, dir } = createTempYaml(`
+tests:
+  - id: case-1
+    input: "Research Apple"
+    input_object:
+      company: Apple
+      ticker: AAPL
+    assertions:
+      - name: dexter_rubric
+        type: llm-grader
+        rubrics:
+          - id: factual
+            operator: correctness
+            criteria: "Uses the supplied company and ticker"
+`);
+
+    const suite = await loadTestSuite(filePath, dir);
+    expect(suite.tests[0].inputObject).toEqual({ company: 'Apple', ticker: 'AAPL' });
+    const grader = suite.tests[0].assertions?.[0];
+    expect(grader?.type).toBe('llm-grader');
+    if (grader?.type === 'llm-grader') {
+      expect(grader.rubrics?.[0]).toMatchObject({
+        id: 'factual',
+        operator: 'correctness',
+        outcome: 'Uses the supplied company and ticker',
+      });
+    }
+  });
 });
diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts
index 374ca6651..d92e5604f 100644
--- a/packages/eval/src/schemas.ts
+++ b/packages/eval/src/schemas.ts
@@ -276,6 +276,8 @@ export const CodeGraderInputSchema = z.object({
   outputPath: z.string().optional(),
   inputFiles: z.array(z.string()),
   input: z.array(MessageSchema),
+  inputObject: z.unknown().nullable().optional(),
+  metadata: z.record(z.unknown()).nullable().optional(),
   trace: TraceSummarySchema.nullable().optional(),
   tokenUsage: TokenUsageSchema.nullable().optional(),
   costUsd: z.number().nullable().optional(),

From dcdde97d0c488e9d9266dec299cd12b0881073fd Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 10 Jun 2026 05:24:57 +0200
Subject: [PATCH 2/3] docs: clarify optional llm grader input object

---
 apps/web/src/content/docs/docs/graders/llm-graders.mdx | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
index cc40ab2c1..85b78145f 100644
--- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
@@ -74,8 +74,8 @@ Score the response from 0.0 to 1.0 based on:
 | `output` | Candidate answer text |
 | `metadata` | Test metadata as formatted JSON |
 | `metadata_json` | Test metadata as compact JSON |
-| `input_object` | Test `input_object` as formatted JSON |
-| `input_object_json` | Test `input_object` as compact JSON |
+| `input_object` | Optional grader-only `input_object` payload as formatted JSON |
+| `input_object_json` | Optional grader-only `input_object` payload as compact JSON |
 | `rubrics` | LLM-grader rubric items as formatted JSON |
 | `rubrics_json` | LLM-grader rubric items as compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
@@ -83,7 +83,9 @@ Score the response from 0.0 to 1.0 based on:
 
 Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths.
 
-Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file:
+`input_object` is optional grader-only structured data. Use it when the grader prompt needs a stable machine-readable payload through `{{input_object_json}}` while the agent-facing `input` remains a natural-language prompt or message array. If your existing `input` already contains the needed object (for example, a message whose `content` is a JSON object), keep using `input` and reference `{{input}}`; you do not need to duplicate that data into `input_object`.
+
+Suite-level `metadata` is inherited by every test. When rubric items vary per test and you want a separate grader-only payload, keep the grader on each test and reuse the prompt file:
 
 ```yaml
 metadata:
@@ -264,7 +266,7 @@ Derived strings injected into grader prompts:
 | `expected_output` | Reference answer text |
 | `output` | Candidate answer text |
 | `metadata_json` | Test metadata, compact JSON |
-| `input_object_json` | Structured test input object, compact JSON |
+| `input_object_json` | Optional grader-only structured payload, compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
 

From 02bd6982d03ee9faede6b94a478d271ee1a76660 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 10 Jun 2026 05:48:39 +0200
Subject: [PATCH 3/3] refactor(core): reuse input for structured llm grader
 data

---
 .../web/src/content/docs/docs/graders/llm-graders.mdx | 11 +++--------
 packages/core/src/evaluation/graders/code-grader.ts   |  1 -
 .../core/src/evaluation/graders/llm-grader-prompt.ts  |  2 --
 packages/core/src/evaluation/graders/llm-grader.ts    |  2 --
 .../core/src/evaluation/graders/prompt-resolution.ts  |  1 -
 .../src/evaluation/loaders/shorthand-expansion.ts     | 10 ++++++++++
 packages/core/src/evaluation/template-variables.ts    |  4 ----
 packages/core/src/evaluation/types.ts                 |  2 --
 packages/core/src/evaluation/yaml-parser.ts           |  8 +-------
 .../core/test/evaluation/evaluators_variables.test.ts | 10 +++++-----
 .../test/evaluation/graders/prompt-resolution.test.ts |  2 +-
 .../core/test/evaluation/yaml-parser-metadata.test.ts |  8 ++++----
 packages/eval/src/schemas.ts                          |  1 -
 13 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
index 85b78145f..ec5241805 100644
--- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx
+++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
@@ -74,8 +74,6 @@ Score the response from 0.0 to 1.0 based on:
 | `output` | Candidate answer text |
 | `metadata` | Test metadata as formatted JSON |
 | `metadata_json` | Test metadata as compact JSON |
-| `input_object` | Optional grader-only `input_object` payload as formatted JSON |
-| `input_object_json` | Optional grader-only `input_object` payload as compact JSON |
 | `rubrics` | LLM-grader rubric items as formatted JSON |
 | `rubrics_json` | LLM-grader rubric items as compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
@@ -83,9 +81,9 @@ Score the response from 0.0 to 1.0 based on:
 
 Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths.
 
-`input_object` is optional grader-only structured data. Use it when the grader prompt needs a stable machine-readable payload through `{{input_object_json}}` while the agent-facing `input` remains a natural-language prompt or message array. If your existing `input` already contains the needed object (for example, a message whose `content` is a JSON object), keep using `input` and reference `{{input}}`; you do not need to duplicate that data into `input_object`.
+Structured task input belongs in `input`. A bare object without a `role` key (as in the example below) is shorthand for a single user message whose `content` is that object. When a message's `content` is a JSON object, `{{input}}` renders that object as formatted JSON for the grader prompt, so no separate grader-only input field is required. Use `metadata` for provenance or suite-level source fields, and `{{rubrics_json}}` for rubric arrays.
 
-Suite-level `metadata` is inherited by every test. When rubric items vary per test and you want a separate grader-only payload, keep the grader on each test and reuse the prompt file:
+Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file:
 
 ```yaml
 metadata:
@@ -95,8 +93,7 @@ metadata:
 
 tests:
   - id: apple-research
-    input: Research Apple
-    input_object:
+    input:
       company: Apple
       ticker: AAPL
     metadata:
@@ -228,7 +225,6 @@ TypeScript templates receive a context object with these fields:
 | `expectedOutput` | `Message[]` | Full resolved expected output |
 | `output` | `Message[]` | Full provider output messages |
 | `trace` | `TraceSummary` | Execution metrics summary |
-| `inputObject` | `unknown` | Optional structured `input_object` payload |
 | `metadata` | `object` | Test metadata after suite defaults are merged |
 | `config` | `object` | Custom config from YAML |
 
@@ -266,7 +262,6 @@ Derived strings injected into grader prompts:
 | `expected_output` | Reference answer text |
 | `output` | Candidate answer text |
 | `metadata_json` | Test metadata, compact JSON |
-| `input_object_json` | Optional grader-only structured payload, compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
 
diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts
index 1aca3abd6..3ec89061b 100644
--- a/packages/core/src/evaluation/graders/code-grader.ts
+++ b/packages/core/src/evaluation/graders/code-grader.ts
@@ -168,7 +168,6 @@ export class CodeGrader implements Grader {
         context.evalCase.input as readonly Record<string, unknown>[],
         getImageDir,
       ),
-      inputObject: context.evalCase.inputObject ?? null,
       metadata: context.evalCase.metadata ?? null,
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts
index fc12ff0ef..fe79c525e 100644
--- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts
+++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts
@@ -46,8 +46,6 @@ function buildTemplateVariables(input: {
     [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(),
     [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(input.evalCase.metadata),
     [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(input.evalCase.metadata),
-    [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(input.evalCase.inputObject),
-    [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(input.evalCase.inputObject),
     [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(input.rubrics),
     [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(input.rubrics),
     [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? '',
diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts
index 5092da81f..acdf85248 100644
--- a/packages/core/src/evaluation/graders/llm-grader.ts
+++ b/packages/core/src/evaluation/graders/llm-grader.ts
@@ -177,8 +177,6 @@ function buildTemplateVariables(context: EvaluationContext): Record<string, stri
     [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
     [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata),
     [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata),
-    [TEMPLATE_VARIABLES.INPUT_OBJECT]: stringifyPretty(context.evalCase.inputObject),
-    [TEMPLATE_VARIABLES.INPUT_OBJECT_JSON]: stringifyCompact(context.evalCase.inputObject),
     [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
     [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
     [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts
index 0e90fd47f..b31717047 100644
--- a/packages/core/src/evaluation/graders/prompt-resolution.ts
+++ b/packages/core/src/evaluation/graders/prompt-resolution.ts
@@ -103,7 +103,6 @@ async function executePromptTemplate(
     output: context.output ?? null,
     inputFiles: context.evalCase.file_paths,
     input: context.evalCase.input,
-    inputObject: context.evalCase.inputObject ?? null,
     metadata: context.evalCase.metadata ?? null,
     trace: context.trace ?? null,
     fileChanges: context.fileChanges ?? null,
diff --git a/packages/core/src/evaluation/loaders/shorthand-expansion.ts b/packages/core/src/evaluation/loaders/shorthand-expansion.ts
index bcc4e37f0..a8fbd9008 100644
--- a/packages/core/src/evaluation/loaders/shorthand-expansion.ts
+++ b/packages/core/src/evaluation/loaders/shorthand-expansion.ts
@@ -15,6 +15,7 @@ import { isJsonObject, isTestMessage } from '../types.js';
  *
  * Supports:
  * - String: "What is 2+2?" -> [{ role: 'user', content: "What is 2+2?" }]
+ * - Object (without role key): { accuracy: 0.9 } -> [{ role: 'user', content: { accuracy: 0.9 } }]
  * - Array of messages: Already in message format, passthrough
  *
  * @param value The raw `input` value from YAML/JSONL
@@ -30,6 +31,15 @@ export function expandInputShorthand(value: JsonValue | undefined): TestMessage[
     return [{ role: 'user', content: value }];
   }
 
+  // Object shorthand: a plain object becomes one user message with structured content.
+  // An object carrying a `role` key is treated as a message itself; it yields undefined when it is not a valid message.
+  if (isJsonObject(value)) {
+    if ('role' in value) {
+      return isTestMessage(value) ? [value] : undefined;
+    }
+    return [{ role: 'user', content: value }];
+  }
+
   // Array: should be message array
   if (Array.isArray(value)) {
     const messages = value.filter((msg): msg is TestMessage => isTestMessage(msg));
diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts
index 0fd268e09..81e13c36a 100644
--- a/packages/core/src/evaluation/template-variables.ts
+++ b/packages/core/src/evaluation/template-variables.ts
@@ -9,8 +9,6 @@
  *   - {{ criteria }}        — evaluation criteria string
  *   - {{ metadata }}        — per-test metadata as formatted JSON
  *   - {{ metadata_json }}   — per-test metadata as compact JSON
- *   - {{ input_object }}      — per-test structured input object as formatted JSON
- *   - {{ input_object_json }} — per-test structured input object as compact JSON
  *   - {{ rubrics }}        — llm-grader rubrics as formatted JSON
  *   - {{ rubrics_json }}   — llm-grader rubrics as compact JSON
  *   - {{ file_changes }}    — file diff (if available)
@@ -26,8 +24,6 @@ export const TEMPLATE_VARIABLES = {
   CRITERIA: 'criteria',
   METADATA: 'metadata',
   METADATA_JSON: 'metadata_json',
-  INPUT_OBJECT: 'input_object',
-  INPUT_OBJECT_JSON: 'input_object_json',
   RUBRICS: 'rubrics',
   RUBRICS_JSON: 'rubrics_json',
   INPUT: 'input',
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 1b31f5bfc..7e764ff8b 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -1009,8 +1009,6 @@ export interface EvalTest {
   readonly conversation_id?: string;
   readonly question: string;
   readonly input: readonly TestMessage[];
-  /** Optional structured per-case input payload for grader prompt templates. */
-  readonly inputObject?: JsonValue;
   readonly expected_output: readonly JsonObject[];
   readonly reference_answer?: string;
   readonly file_paths: readonly string[];
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 35c9c34e9..c82592992 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -64,7 +64,7 @@ import type {
   WorkspaceHooksConfig,
   WorkspaceScriptConfig,
 } from './types.js';
-import { isJsonObject, isJsonValue, isTestMessage } from './types.js';
+import { isJsonObject, isTestMessage } from './types.js';
 import { parseRepoConfig } from './workspace/repo-config-parser.js';
 import { parseYamlValue } from './yaml-loader.js';
 
@@ -142,7 +142,6 @@ type RawEvalCase = JsonObject & {
   /** @deprecated Use `criteria` instead */
   readonly expected_outcome?: JsonValue;
   readonly input?: JsonValue;
-  readonly input_object?: JsonValue;
   /** Shorthand: list of file paths to prepend as type:file content blocks in the user message. */
   readonly input_files?: JsonValue;
   readonly expected_output?: JsonValue;
@@ -635,10 +634,6 @@ async function loadTestsFromYaml(
       ? (renderedCase.metadata as Record<string, unknown>)
       : undefined;
     const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
-    const inputObject = isJsonValue(renderedCase.input_object)
-      ? renderedCase.input_object
-      : undefined;
-
     // Extract per-test targets override (matrix evaluation)
     const caseTargets = extractTargetsFromTestCase(renderedCase as JsonObject);
 
@@ -683,7 +678,6 @@ async function loadTestsFromYaml(
       conversation_id: conversationId,
       question: question,
       input: inputMessages,
-      ...(inputObject !== undefined ? { inputObject } : {}),
       expected_output: outputSegments,
       reference_answer: referenceAnswer,
       file_paths: userFilePaths,
diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts
index 008e8d80f..835084494 100644
--- a/packages/core/test/evaluation/evaluators_variables.test.ts
+++ b/packages/core/test/evaluation/evaluators_variables.test.ts
@@ -96,10 +96,10 @@ File Changes: {{file_changes}}
     expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`);
   });
 
-  it('substitutes structured metadata, input_object, and rubrics variables', async () => {
+  it('substitutes structured input, metadata, and rubrics variables', async () => {
     const customPrompt = `
 Metadata: {{metadata_json}}
-Input Object: {{input_object_json}}
+Input: {{input}}
 Rubrics: {{rubrics_json}}
 Candidate: {{output}}
 `;
@@ -124,14 +124,14 @@ Candidate: {{output}}
     await evaluator.evaluate({
       evalCase: {
         ...baseTestCase,
-        inputObject: { company: 'Apple', ticker: 'AAPL' },
+        input: [{ role: 'user', content: { company: 'Apple', ticker: 'AAPL' } }],
         metadata: { source_repo: 'https://github.com/virattt/dexter' },
       },
       candidate: 'Apple revenue increased.',
       target: baseTarget,
       provider: graderProvider,
       attempt: 0,
-      promptInputs: { question: 'Research Apple' },
+      promptInputs: { question: '{\n  "company": "Apple",\n  "ticker": "AAPL"\n}' },
       now: new Date(),
       evaluator: {
         name: 'dexter',
@@ -150,7 +150,7 @@ Candidate: {{output}}
 
     const prompt = graderProvider.lastRequest?.question ?? '';
     expect(prompt).toContain('"source_repo":"https://github.com/virattt/dexter"');
-    expect(prompt).toContain('"ticker":"AAPL"');
+    expect(prompt).toContain('"ticker": "AAPL"');
     expect(prompt).toContain('"operator":"correctness"');
     expect(prompt).toContain('Candidate: Apple revenue increased.');
   });
diff --git a/packages/core/test/evaluation/graders/prompt-resolution.test.ts b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
index 236acf9cb..1c17cec2f 100644
--- a/packages/core/test/evaluation/graders/prompt-resolution.test.ts
+++ b/packages/core/test/evaluation/graders/prompt-resolution.test.ts
@@ -29,7 +29,7 @@ describe('containsTemplateVariables', () => {
   it('returns true for structured template variables', () => {
     expect(
       containsTemplateVariables(
-        'Review {{metadata_json}}, {{input_object_json}}, and {{rubrics_json}} against {{output}}',
+        'Review {{metadata_json}} and {{rubrics_json}} against {{input}} and {{output}}',
       ),
     ).toBe(true);
   });
diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts
index 08280300b..0d46e7867 100644
--- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts
+++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts
@@ -199,12 +199,11 @@ tests:
     });
   });
 
-  it('loads structured input_object and rubric criteria aliases', async () => {
+  it('loads structured input objects and rubric criteria aliases', async () => {
     const { filePath, dir } = createTempYaml(`
 tests:
   - id: case-1
-    input: "Research Apple"
-    input_object:
+    input:
       company: Apple
       ticker: AAPL
     assertions:
@@ -217,7 +216,8 @@ tests:
 `);
 
     const suite = await loadTestSuite(filePath, dir);
-    expect(suite.tests[0].inputObject).toEqual({ company: 'Apple', ticker: 'AAPL' });
+    expect(suite.tests[0].input[0].content).toEqual({ company: 'Apple', ticker: 'AAPL' });
+    expect(suite.tests[0].question).toContain('"ticker": "AAPL"');
     const grader = suite.tests[0].assertions?.[0];
     expect(grader?.type).toBe('llm-grader');
     if (grader?.type === 'llm-grader') {
diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts
index d92e5604f..4d2eb340e 100644
--- a/packages/eval/src/schemas.ts
+++ b/packages/eval/src/schemas.ts
@@ -276,7 +276,6 @@ export const CodeGraderInputSchema = z.object({
   outputPath: z.string().optional(),
   inputFiles: z.array(z.string()),
   input: z.array(MessageSchema),
-  inputObject: z.unknown().nullable().optional(),
   metadata: z.record(z.unknown()).nullable().optional(),
   trace: TraceSummarySchema.nullable().optional(),
   tokenUsage: TokenUsageSchema.nullable().optional(),