EntityProcess · christso · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx
@@ -29,7 +29,7 @@ Reference an LLM grader in your eval file:
 assertions:
   - name: semantic_check
     type: llm-grader
-    prompt: ./graders/correctness.md
+    prompt: file://graders/correctness.md
     target: grader_gpt_5_mini   # optional: route this grader to a named LLM target
 ```
 
@@ -69,12 +69,44 @@ Score the response from 0.0 to 1.0 based on:
 | `output_text` | Last candidate response content |
 | `expected_output_text` | Last expected message content |
 | `criteria` | Test `criteria` field |
-| `input` | Full resolved input array, JSON-serialized |
-| `expected_output` | Full resolved expected array, JSON-serialized |
-| `output` | Full provider output array, JSON-serialized |
+| `input` | Resolved input text |
+| `expected_output` | Reference answer text |
+| `output` | Candidate answer text |
+| `metadata` | Test metadata as formatted JSON |
+| `metadata_json` | Test metadata as compact JSON |
+| `rubrics` | LLM-grader rubric items as formatted JSON |
+| `rubrics_json` | LLM-grader rubric items as compact JSON |
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
 
+Use `prompt: file://path/to/prompt.md` to reuse a markdown prompt file. Bare `prompt: "..."` strings are treated as inline prompt text, not file paths.
+
+Structured task input belongs in `input`. If `input` is a message whose `content` is a JSON object, `{{input}}` renders that object as formatted JSON for the grader prompt; no separate grader-only input field is required. Use `metadata` for provenance or suite-level source fields, and `rubrics_json` for rubric arrays.
+
+Suite-level `metadata` is inherited by every test. When rubric items vary per test, keep the grader on each test and reuse the prompt file:
+
+```yaml
+metadata:
+  source_repo: https://github.com/virattt/dexter
+  source_commit: 8d9419829f443f84b804d033bb2c3b1fbd788629
+  source_file: src/evals/dataset/finance_agent.csv
+
+tests:
+  - id: apple-research
+    input:
+      company: Apple
+      ticker: AAPL
+    metadata:
+      row: 1
+    assertions:
+      - name: dexter_semantic
+        type: llm-grader
+        prompt: file://prompts/dexter-grader.md
+        rubrics:
+          - operator: correctness
+            criteria: Uses the provided ticker and company.
+```
+
 ## Per-Grader Target
 
 By default, an `llm-grader` uses the suite target's `grader_target`. Override it per grader when you need multiple grader models in one run:
@@ -193,6 +225,7 @@ TypeScript templates receive a context object with these fields:
 | `expectedOutput` | `Message[]` | Full resolved expected output |
 | `output` | `Message[]` | Full provider output messages |
 | `trace` | `TraceSummary` | Execution metrics summary |
+| `metadata` | `object` | Test metadata after suite defaults are merged |
 | `config` | `object` | Custom config from YAML |
 
 ## Template Variable Derivation
@@ -225,9 +258,10 @@ Derived strings injected into grader prompts:
 | `criteria` | Passed through from the test field |
 | `expected_output_text` | Content of the last entry in `expected_output` |
 | `output_text` | Content of the last entry in `output` |
-| `input` | Full resolved input array, JSON-serialized |
-| `expected_output` | Full resolved expected array, JSON-serialized |
-| `output` | Full provider output array, JSON-serialized |
+| `input` | Resolved input text |
+| `expected_output` | Reference answer text |
+| `output` | Candidate answer text |
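+| `metadata` | Test metadata, formatted JSON |
+| `metadata_json` | Test metadata, compact JSON |
+| `rubrics` | LLM-grader rubric items, formatted JSON |
+| `rubrics_json` | LLM-grader rubric items, compact JSON |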
 | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) |
 | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) |
 

diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts
@@ -168,6 +168,7 @@ export class CodeGrader implements Grader {
         context.evalCase.input as readonly Record<string, unknown>[],
         getImageDir,
       ),
+      metadata: context.evalCase.metadata ?? null,
       trace: context.trace ?? null,
       tokenUsage: context.tokenUsage ?? null,
       costUsd: context.costUsd ?? null,

diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts
@@ -18,6 +18,44 @@ export interface LlmGraderPromptAssembly {
   mode: 'freeform' | 'checklist' | 'score_range';
 }
 
+/** Serializes `value` as JSON with two-space indentation; `undefined` becomes the empty string. */
+function stringifyPretty(value: unknown): string {
+  if (value === undefined) {
+    return '';
+  }
+  return JSON.stringify(value, null, 2);
+}
+
+/** Serializes `value` as single-line JSON; `undefined` becomes the empty string. */
+function stringifyCompact(value: unknown): string {
+  if (value === undefined) {
+    return '';
+  }
+  return JSON.stringify(value);
+}
+
+/**
+ * Builds the variable map substituted into LLM-grader prompt templates.
+ *
+ * The input text is `promptInputs.question` unless it is missing or whitespace-only, in which
+ * case `evalCase.question` is used. Metadata and rubrics are exposed both as indented and as
+ * compact JSON; `fileChanges` and `toolCalls` fall back to the empty string when absent.
+ */
+function buildTemplateVariables(input: {
+  evalCase: EvalTest;
+  candidate: string;
+  promptInputs: PromptInputs;
+  rubrics?: readonly RubricItem[];
+  fileChanges?: string;
+  toolCalls?: string;
+}): Record<string, string> {
+  const { evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls } = input;
+
+  const hasPromptQuestion =
+    Boolean(promptInputs.question) && promptInputs.question.trim().length > 0;
+  const inputText = (hasPromptQuestion ? promptInputs.question : evalCase.question).trim();
+  const outputText = candidate.trim();
+  const expectedText = (evalCase.reference_answer ?? '').trim();
+
+  return {
+    [TEMPLATE_VARIABLES.INPUT]: inputText,
+    [TEMPLATE_VARIABLES.OUTPUT]: outputText,
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedText,
+    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
+    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(evalCase.metadata),
+    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(evalCase.metadata),
+    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
+    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
+    [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '',
+    [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '',
+    // Deprecated aliases: same values as the primary variables above
+    [TEMPLATE_VARIABLES.INPUT_TEXT]: inputText,
+    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: outputText,
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedText,
+  };
+}
+
 export function assembleLlmGraderPrompt(input: {
   evalCase: EvalTest;
   candidate: string;
@@ -42,6 +80,17 @@ export function assembleLlmGraderPrompt(input: {
 
   // Detect mode
   if (rubrics && rubrics.length > 0) {
+    if (graderTemplateOverride) {
+      return assembleCustom(
+        evalCase,
+        candidate,
+        promptInputs,
+        rubrics,
+        fileChanges,
+        toolCalls,
+        graderTemplateOverride,
+      );
+    }
     const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
     if (hasScoreRanges) {
       return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
@@ -67,23 +116,13 @@ function assembleFreeform(
   toolCalls?: string,
   graderTemplateOverride?: string,
 ): LlmGraderPromptAssembly {
-  const formattedQuestion =
-    promptInputs.question && promptInputs.question.trim().length > 0
-      ? promptInputs.question
-      : evalCase.question;
-
-  const variables = {
-    [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-    [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
-    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(),
-    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
-    [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '',
-    [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '',
-    // Deprecated aliases
-    [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
-    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? '').trim(),
-  };
+  const variables = buildTemplateVariables({
+    evalCase,
+    candidate,
+    promptInputs,
+    fileChanges,
+    toolCalls,
+  });
 
   const systemPrompt = buildOutputSchema();
   const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
@@ -105,6 +144,37 @@ function assembleFreeform(
   };
 }
 
+/**
+ * Assembles a rubric-mode grader prompt from a caller-supplied template.
+ *
+ * The template, after variable substitution, becomes the user prompt. The system prompt (also
+ * returned as `responseSchema`) is the score-range output schema when any rubric declares
+ * score ranges, otherwise the rubric output schema; `mode` is chosen the same way.
+ */
+function assembleCustom(
+  evalCase: EvalTest,
+  candidate: string,
+  promptInputs: PromptInputs,
+  rubrics: readonly RubricItem[],
+  fileChanges: string | undefined,
+  toolCalls: string | undefined,
+  graderTemplateOverride: string,
+): LlmGraderPromptAssembly {
+  const usesScoreRanges = rubrics.some((rubric) => (rubric.score_ranges?.length ?? 0) > 0);
+
+  const schema = usesScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema();
+
+  const variables = buildTemplateVariables({
+    evalCase,
+    candidate,
+    promptInputs,
+    rubrics,
+    fileChanges,
+    toolCalls,
+  });
+  const renderedTemplate = substituteVariables(graderTemplateOverride, variables);
+
+  if (usesScoreRanges) {
+    return {
+      systemPrompt: schema,
+      userPrompt: renderedTemplate,
+      responseSchema: schema,
+      mode: 'score_range',
+    };
+  }
+  return {
+    systemPrompt: schema,
+    userPrompt: renderedTemplate,
+    responseSchema: schema,
+    mode: 'checklist',
+  };
+}
+
 function assembleChecklist(
   evalCase: EvalTest,
   candidate: string,

diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts
@@ -155,6 +155,39 @@ interface StructuredGenerationResult {
   readonly tokenUsage?: TokenUsage;
 }
 
+/** Serializes `value` as JSON with two-space indentation; `undefined` becomes the empty string. */
+function stringifyPretty(value: unknown): string {
+  if (value === undefined) {
+    return '';
+  }
+  return JSON.stringify(value, null, 2);
+}
+
+/** Serializes `value` as single-line JSON; `undefined` becomes the empty string. */
+function stringifyCompact(value: unknown): string {
+  if (value === undefined) {
+    return '';
+  }
+  return JSON.stringify(value);
+}
+
+/**
+ * Builds the variable map substituted into LLM-grader prompt templates from an evaluation context.
+ *
+ * The input text is `promptInputs.question` unless it is missing or whitespace-only, in which
+ * case `evalCase.question` is used. Rubrics are taken from the evaluator config only when its
+ * type is `llm-grader`. Metadata and rubrics are exposed both as indented and as compact JSON.
+ */
+function buildTemplateVariables(context: EvaluationContext): Record<string, string> {
+  const { evalCase, promptInputs, evaluator } = context;
+
+  const hasPromptQuestion =
+    Boolean(promptInputs.question) && promptInputs.question.trim().length > 0;
+  const inputText = (hasPromptQuestion ? promptInputs.question : evalCase.question).trim();
+  const outputText = context.candidate.trim();
+  const expectedText = (evalCase.reference_answer ?? '').trim();
+  const rubrics = evaluator?.type === 'llm-grader' ? evaluator.rubrics : undefined;
+
+  return {
+    [TEMPLATE_VARIABLES.INPUT]: inputText,
+    [TEMPLATE_VARIABLES.OUTPUT]: outputText,
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedText,
+    [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
+    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(evalCase.metadata),
+    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(evalCase.metadata),
+    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
+    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
+    [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
+    [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
+    // Deprecated aliases — same values as the primary variables above
+    [TEMPLATE_VARIABLES.INPUT_TEXT]: inputText,
+    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: outputText,
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedText,
+  };
+}
+
 function resolveContentBasePath(context: EvaluationContext): string | undefined {
   if (context.workspacePath) {
     return context.workspacePath;
@@ -259,25 +292,7 @@ export class LlmGrader implements Grader {
     context: EvaluationContext,
     graderProvider: Provider,
   ): Promise<EvaluationScore> {
-    const formattedQuestion =
-      context.promptInputs.question && context.promptInputs.question.trim().length > 0
-        ? context.promptInputs.question
-        : context.evalCase.question;
-
-    // Prepare template variables for substitution.
-    // Primary variables resolve to human-readable text; deprecated _text aliases map to the same values.
-    const variables = {
-      [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
-      [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
-      [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
-      [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
-      // Deprecated aliases — same values as the primary variables above
-      [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
-    };
+    const variables = buildTemplateVariables(context);
 
     // Build system prompt (only the mandatory output schema)
     const systemPrompt = buildOutputSchema();
@@ -367,7 +382,10 @@ export class LlmGrader implements Grader {
       return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
     }
 
-    const prompt = this.buildRubricPrompt(context, rubrics);
+    const prompt =
+      context.graderTemplateOverride || this.graderTemplate
+        ? this.buildCustomPrompt(context)
+        : this.buildRubricPrompt(context, rubrics);
     const systemPrompt = buildRubricOutputSchema();
 
     const graderRawRequest: JsonObject = {
@@ -423,7 +441,10 @@ export class LlmGrader implements Grader {
     graderProvider: Provider,
     rubrics: readonly RubricItem[],
   ): Promise<EvaluationScore> {
-    const prompt = this.buildScoreRangePrompt(context, rubrics);
+    const prompt =
+      context.graderTemplateOverride || this.graderTemplate
+        ? this.buildCustomPrompt(context)
+        : this.buildScoreRangePrompt(context, rubrics);
     const systemPrompt = buildScoreRangeOutputSchema();
 
     const graderRawRequest: JsonObject = {
@@ -688,22 +709,12 @@ export class LlmGrader implements Grader {
         ? context.promptInputs.question
         : context.evalCase.question;
 
-    const variables: Record<string, string> = {
-      [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
-      [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
-      [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
-      [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
-      // Deprecated aliases
-      [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-      [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
-    };
+    const variables = buildTemplateVariables(context);
 
-    if (this.graderTemplate) {
-      warnDeprecatedTemplateVars(this.graderTemplate);
-      return substituteVariables(this.graderTemplate, variables);
+    const template = context.graderTemplateOverride ?? this.graderTemplate;
+    if (template) {
+      warnDeprecatedTemplateVars(template);
+      return substituteVariables(template, variables);
     }
 
     const config = context.evaluator;
@@ -767,21 +778,11 @@ export class LlmGrader implements Grader {
     const config = context.evaluator;
     const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined;
 
-    if (this.graderTemplate) {
-      const variables: Record<string, string> = {
-        [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
-        [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
-        [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
-        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(),
-        [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '',
-        [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '',
-        // Deprecated aliases
-        [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
-        [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
-        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(),
-      };
-      warnDeprecatedTemplateVars(this.graderTemplate);
-      const customPrompt = substituteVariables(this.graderTemplate, variables);
+    const template = context.graderTemplateOverride ?? this.graderTemplate;
+    if (template) {
+      const variables = buildTemplateVariables(context);
+      warnDeprecatedTemplateVars(template);
+      const customPrompt = substituteVariables(template, variables);
 
       const outputSchema =
         rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
@@ -984,6 +985,12 @@ export class LlmGrader implements Grader {
     return parts.join('\n');
   }
 
+  /**
+   * Renders the custom grader template with the template variables for this context.
+   *
+   * The per-evaluation override takes precedence over the grader-level template. Resolution
+   * uses `||` rather than `??` so that it matches the callers' gate
+   * (`context.graderTemplateOverride || this.graderTemplate`): an empty-string override falls
+   * back to the grader-level template instead of producing an empty prompt.
+   */
+  private buildCustomPrompt(context: EvaluationContext): string {
+    const template = context.graderTemplateOverride || this.graderTemplate || '';
+    warnDeprecatedTemplateVars(template);
+    return substituteVariables(template, buildTemplateVariables(context));
+  }
+
   private buildRubricPrompt(context: EvaluationContext, rubrics: readonly RubricItem[]): string {
     const formattedQuestion =
       context.promptInputs.question && context.promptInputs.question.trim().length > 0

diff --git a/packages/core/src/evaluation/graders/prompt-resolution.ts b/packages/core/src/evaluation/graders/prompt-resolution.ts
@@ -103,6 +103,7 @@ async function executePromptTemplate(
     output: context.output ?? null,
     inputFiles: context.evalCase.file_paths,
     input: context.evalCase.input,
+    metadata: context.evalCase.metadata ?? null,
     trace: context.trace ?? null,
     fileChanges: context.fileChanges ?? null,
     workspacePath: context.workspacePath ?? null,

diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts
@@ -2143,7 +2143,7 @@ function parseRubricItems(
     }
 
     const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
-    const expectedOutcome = asString(rawRubric.outcome) ?? '';
+    const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? '';
     const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
     const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0;