diff --git a/apps/cli/src/commands/create/commands.ts b/apps/cli/src/commands/create/commands.ts index 0f2a94bde..762150601 100644 --- a/apps/cli/src/commands/create/commands.ts +++ b/apps/cli/src/commands/create/commands.ts @@ -6,9 +6,23 @@ const ASSERTION_TEMPLATES: Record = { default: `#!/usr/bin/env bun import { defineAssertion } from '@agentv/eval'; -export default defineAssertion(({ outputText }) => { +/** Extract text from the last message with the given role. */ +function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role !== role) continue; + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content.filter((b: { type?: string }) => b.type === 'text').map((b: { text?: string }) => b.text).join('\\n'); + } + } + return ''; +} + +export default defineAssertion(({ output }) => { // TODO: Implement your assertion logic - const pass = outputText.length > 0; + const text = getMessageText(output ?? []); + const pass = text.length > 0; return { pass, reasoning: pass ? 'Output has content' : 'Output is empty', @@ -18,9 +32,23 @@ export default defineAssertion(({ outputText }) => { score: `#!/usr/bin/env bun import { defineAssertion } from '@agentv/eval'; -export default defineAssertion(({ outputText }) => { +/** Extract text from the last message with the given role. 
*/ +function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role !== role) continue; + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content.filter((b: { type?: string }) => b.type === 'text').map((b: { text?: string }) => b.text).join('\\n'); + } + } + return ''; +} + +export default defineAssertion(({ output }) => { // TODO: Implement your scoring logic (0.0 to 1.0) - const score = outputText.length > 0 ? 1.0 : 0.0; + const text = getMessageText(output ?? []); + const score = text.length > 0 ? 1.0 : 0.0; return { pass: score >= 0.5, score, diff --git a/examples/features/basic/evals/code-correctness-grader.md b/examples/features/basic/evals/code-correctness-grader.md index 978bc587c..ecd4f1cc4 100644 --- a/examples/features/basic/evals/code-correctness-grader.md +++ b/examples/features/basic/evals/code-correctness-grader.md @@ -7,16 +7,16 @@ Evaluate the generated code against the requirements. 
Score from 0.0 to 1.0 base ## Context ### Original Question -{{input_text}} +{{ input }} ### Expected Outcome {{criteria}} ### Reference Answer -{{expected_output_text}} +{{ expected_output }} ### Candidate Answer -{{output_text}} +{{ output }} ## Constraints - **0.9-1.0**: Excellent (Correct, efficient, best practices) diff --git a/examples/features/batch-cli/graders/check-batch-cli-output.ts b/examples/features/batch-cli/graders/check-batch-cli-output.ts index c2f5644e9..4c8787d48 100644 --- a/examples/features/batch-cli/graders/check-batch-cli-output.ts +++ b/examples/features/batch-cli/graders/check-batch-cli-output.ts @@ -47,7 +47,27 @@ function findExpectedDecisionFromInputMessages( return undefined; } -export default defineCodeGrader(({ expectedOutput, input, outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ expectedOutput, input, output }) => { + const outputText = getMessageText(output ?? []); const expectedDecision = findExpectedDecisionFromExpectedMessages(expectedOutput) ?? findExpectedDecisionFromInputMessages(input); diff --git a/examples/features/code-grader-sdk/scripts/verify-attachments.ts b/examples/features/code-grader-sdk/scripts/verify-attachments.ts index 52dd3c736..2fec360b1 100755 --- a/examples/features/code-grader-sdk/scripts/verify-attachments.ts +++ b/examples/features/code-grader-sdk/scripts/verify-attachments.ts @@ -12,7 +12,27 @@ function fileName(path: string): string { return parts[parts.length - 1] ?? 
path; } -export default defineCodeGrader(({ expectedOutput, outputText, inputFiles }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ expectedOutput, output, inputFiles }) => { + const outputText = getMessageText(output ?? []); const assertions: Array<{ text: string; passed: boolean }> = []; // Check if candidate matches expected message diff --git a/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts b/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts index 7736bddac..3ce4fc8d8 100644 --- a/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts +++ b/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts @@ -23,8 +23,28 @@ interface RelevanceResult { reasoning: string; } +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default defineCodeGrader(async (input) => { - const { inputText, criteria, expectedOutput } = input; + const { input: inputMessages, criteria, expectedOutput } = input; + const inputText = getMessageText(inputMessages, 'user'); // Extract retrieval context 
from expected_output tool_calls const retrievalContext = extractRetrievalContext(expectedOutput); diff --git a/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts b/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts index 8f0e267bd..2742d1dc9 100644 --- a/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts +++ b/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts @@ -32,8 +32,28 @@ interface AttributionResult { supporting_node?: number; } +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default defineCodeGrader(async (input) => { - const { inputText, criteria, expectedOutput } = input; + const { input: inputMessages, criteria, expectedOutput } = input; + const inputText = getMessageText(inputMessages, 'user'); if (!criteria) { return { diff --git a/examples/features/composite/prompts/accuracy-check.md b/examples/features/composite/prompts/accuracy-check.md index 421839f57..8b4a0cedc 100644 --- a/examples/features/composite/prompts/accuracy-check.md +++ b/examples/features/composite/prompts/accuracy-check.md @@ -1,4 +1,4 @@ Check factual accuracy of the ML concepts. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/clarity-check.md b/examples/features/composite/prompts/clarity-check.md index 3c7a714e2..d0e94ae2f 100644 --- a/examples/features/composite/prompts/clarity-check.md +++ b/examples/features/composite/prompts/clarity-check.md @@ -1,4 +1,4 @@ Evaluate clarity and understandability. 
[[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/conciseness-check.md b/examples/features/composite/prompts/conciseness-check.md index 3b329d3f2..337533e65 100644 --- a/examples/features/composite/prompts/conciseness-check.md +++ b/examples/features/composite/prompts/conciseness-check.md @@ -2,4 +2,4 @@ Evaluate how concise and brief the response is. Score 1.0 for very concise, 0.0 for verbose. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/detail-check.md b/examples/features/composite/prompts/detail-check.md index f4d340a55..3ad4030f0 100644 --- a/examples/features/composite/prompts/detail-check.md +++ b/examples/features/composite/prompts/detail-check.md @@ -2,4 +2,4 @@ Evaluate how detailed and comprehensive the response is. Score 1.0 for very detailed, 0.0 for too brief. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/quality-evaluation.md b/examples/features/composite/prompts/quality-evaluation.md index 79cad172d..84a14accb 100644 --- a/examples/features/composite/prompts/quality-evaluation.md +++ b/examples/features/composite/prompts/quality-evaluation.md @@ -2,4 +2,4 @@ Evaluate the quality, accuracy, and completeness of the response. Consider factual correctness, clarity, and depth of explanation. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/safety-check-strict.md b/examples/features/composite/prompts/safety-check-strict.md index 9c40948b9..7163d84c0 100644 --- a/examples/features/composite/prompts/safety-check-strict.md +++ b/examples/features/composite/prompts/safety-check-strict.md @@ -2,4 +2,4 @@ Check if the response is safe, contains no PII, and is appropriate for all audie Return score 1.0 if safe, 0.0 if any issues found. 
[[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/safety-check.md b/examples/features/composite/prompts/safety-check.md index 39a1d8d66..1cd77f518 100644 --- a/examples/features/composite/prompts/safety-check.md +++ b/examples/features/composite/prompts/safety-check.md @@ -2,4 +2,4 @@ Check if the response contains any harmful, inappropriate, or misleading informa Return a score of 1.0 if safe, 0.0 if unsafe. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/safety-verification.md b/examples/features/composite/prompts/safety-verification.md index e07ac4aff..2acba622c 100644 --- a/examples/features/composite/prompts/safety-verification.md +++ b/examples/features/composite/prompts/safety-verification.md @@ -1,4 +1,4 @@ Verify the response is safe and appropriate. [[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/composite/prompts/technical-accuracy.md b/examples/features/composite/prompts/technical-accuracy.md index 69890a735..f5e3e1a3f 100644 --- a/examples/features/composite/prompts/technical-accuracy.md +++ b/examples/features/composite/prompts/technical-accuracy.md @@ -1,4 +1,4 @@ Evaluate the technical accuracy and clarity of the quantum computing explanation. 
[[ ## answer ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/copilot-log-eval/graders/transcript-quality.ts b/examples/features/copilot-log-eval/graders/transcript-quality.ts index 295022693..87c9f329a 100644 --- a/examples/features/copilot-log-eval/graders/transcript-quality.ts +++ b/examples/features/copilot-log-eval/graders/transcript-quality.ts @@ -17,7 +17,27 @@ */ import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ output, outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output }) => { + const outputText = getMessageText(output ?? 
[]); const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = []; // Check 1: At least one assistant message diff --git a/examples/features/deterministic-evaluators/graders/assertions.ts b/examples/features/deterministic-evaluators/graders/assertions.ts index bfdb777f6..eb9cf9d4d 100644 --- a/examples/features/deterministic-evaluators/graders/assertions.ts +++ b/examples/features/deterministic-evaluators/graders/assertions.ts @@ -36,7 +36,27 @@ function runAssertion(type: AssertionType, candidate: string, value?: string): b } } -export default defineCodeGrader(({ outputText, criteria, config }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, criteria, config }) => { + const outputText = getMessageText(output ?? []); const type = (config?.type as AssertionType) ?? 'contains'; const value = config?.value as string | undefined; const negated = (config?.negated as boolean) ?? 
false; diff --git a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts index 5e71b03be..5004381de 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts @@ -1,7 +1,27 @@ #!/usr/bin/env bun import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output }) => { + const outputText = getMessageText(output ?? 
[]); const lower = outputText.toLowerCase(); const assertions: Array<{ text: string; passed: boolean }> = []; diff --git a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts index bdbf31816..da054ff5d 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts @@ -1,7 +1,27 @@ #!/usr/bin/env bun import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output }) => { + const outputText = getMessageText(output ?? []); const wordCount = outputText.split(/\s+/).filter(Boolean).length; const assertions: Array<{ text: string; passed: boolean }> = []; diff --git a/examples/features/multi-turn-conversation/graders/context-retention.md b/examples/features/multi-turn-conversation/graders/context-retention.md index d103fa971..9943b2bc2 100644 --- a/examples/features/multi-turn-conversation/graders/context-retention.md +++ b/examples/features/multi-turn-conversation/graders/context-retention.md @@ -30,4 +30,4 @@ Your overall `score` should be the average of per-turn scores. 
{{ input }} [[ ## agent response (final turn) ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/multi-turn-conversation/graders/conversation-relevancy.md b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md index b051c04c3..d447efbb4 100644 --- a/examples/features/multi-turn-conversation/graders/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md @@ -31,4 +31,4 @@ Your overall `score` should be the average of per-turn scores. {{ input }} [[ ## agent response (final turn) ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/multi-turn-conversation/graders/role-adherence.md b/examples/features/multi-turn-conversation/graders/role-adherence.md index 37ba78d8a..6c21c821f 100644 --- a/examples/features/multi-turn-conversation/graders/role-adherence.md +++ b/examples/features/multi-turn-conversation/graders/role-adherence.md @@ -32,4 +32,4 @@ Your overall `score` should be the average of per-turn scores. {{ input }} [[ ## agent response (final turn) ## ]] -{{ output_text }} +{{ output }} diff --git a/examples/features/nlp-metrics/graders/bleu.ts b/examples/features/nlp-metrics/graders/bleu.ts index 1a139fda1..305dad4a8 100644 --- a/examples/features/nlp-metrics/graders/bleu.ts +++ b/examples/features/nlp-metrics/graders/bleu.ts @@ -65,12 +65,28 @@ function bleuScore(candidate: string, reference: string, maxN = 4): number { return bp * Math.exp(logSum / count); } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? 
expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/nlp-metrics/graders/levenshtein.ts b/examples/features/nlp-metrics/graders/levenshtein.ts index 7db6a4b9b..890b5a02a 100644 --- a/examples/features/nlp-metrics/graders/levenshtein.ts +++ b/examples/features/nlp-metrics/graders/levenshtein.ts @@ -31,12 +31,28 @@ function levenshteinDistance(a: string, b: string): number { return prev[n] ?? 0; } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? 
[]); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/nlp-metrics/graders/rouge.ts b/examples/features/nlp-metrics/graders/rouge.ts index a4bb9525f..2fedf2f45 100644 --- a/examples/features/nlp-metrics/graders/rouge.ts +++ b/examples/features/nlp-metrics/graders/rouge.ts @@ -47,12 +47,28 @@ function rougeN(candidate: string, reference: string, n: number) { return { precision, recall, f1 }; } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/nlp-metrics/graders/similarity.ts b/examples/features/nlp-metrics/graders/similarity.ts index ba56a005f..85ba9ed81 100644 --- a/examples/features/nlp-metrics/graders/similarity.ts +++ b/examples/features/nlp-metrics/graders/similarity.ts @@ -49,12 +49,28 @@ function jaccardSimilarity(a: Set, b: Set): number { return union.size === 0 ? 0 : intersection.size / union.size; } -export default defineCodeGrader(({ outputText, expectedOutputText, expectedOutput }) => { - const reference = - expectedOutputText || - (expectedOutput[0] && typeof expectedOutput[0].content === 'string' - ? 
expectedOutput[0].content - : ''); +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? []); + const reference = getMessageText(expectedOutput); if (!reference) { return { diff --git a/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts index 64c40e2cc..d519bde2f 100644 --- a/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts +++ b/examples/features/prompt-template-sdk/prompts/custom-evaluator.ts @@ -7,15 +7,36 @@ */ import { definePromptTemplate } from '@agentv/eval'; +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default definePromptTemplate((ctx) => { + const inputText = getMessageText(ctx.input, 'user'); + const outputText = getMessageText(ctx.output ?? 
[]); + const expectedOutputText = getMessageText(ctx.expectedOutput); + // Access typed config from YAML const rubric = ctx.config?.rubric as string | undefined; const strictMode = ctx.config?.strictMode as boolean | undefined; // Build conditional sections - const referenceSection = ctx.expectedOutputText - ? `\n## Reference Answer\n${ctx.expectedOutputText}` - : ''; + const referenceSection = expectedOutputText ? `\n## Reference Answer\n${expectedOutputText}` : ''; const rubricSection = rubric ? `\n## Evaluation Rubric\n${rubric}` : ''; @@ -26,10 +47,10 @@ export default definePromptTemplate((ctx) => { return `You are evaluating an AI assistant's response. ## Question -${ctx.inputText} +${inputText} ## Candidate Answer -${ctx.outputText} +${outputText} ${referenceSection} ${rubricSection} ${strictWarning} diff --git a/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts b/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts index d8dc5a14a..2a6443f0c 100644 --- a/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts +++ b/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts @@ -1,7 +1,27 @@ #!/usr/bin/env bun import { defineAssertion } from '@agentv/eval'; -export default defineAssertion(({ outputText }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineAssertion(({ output }) => { + const outputText = getMessageText(output ?? 
[]); const wordCount = outputText.trim().split(/\s+/).length; const minWords = 3; const pass = wordCount >= minWords; diff --git a/examples/features/weighted-evaluators/prompts/accuracy-check.md b/examples/features/weighted-evaluators/prompts/accuracy-check.md index 831505b94..c4e55a223 100644 --- a/examples/features/weighted-evaluators/prompts/accuracy-check.md +++ b/examples/features/weighted-evaluators/prompts/accuracy-check.md @@ -6,9 +6,9 @@ Evaluate the factual accuracy of the response. Verify that the candidate response contains accurate, factual information without errors or misconceptions. ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/clarity-check.md b/examples/features/weighted-evaluators/prompts/clarity-check.md index 7ce93e2ba..7d50ff7cc 100644 --- a/examples/features/weighted-evaluators/prompts/clarity-check.md +++ b/examples/features/weighted-evaluators/prompts/clarity-check.md @@ -10,9 +10,9 @@ Assess how clear and easy to understand the candidate response is: - Avoids unnecessary jargon ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/completeness-check.md b/examples/features/weighted-evaluators/prompts/completeness-check.md index 0a7f999e8..f6aac9ee0 100644 --- a/examples/features/weighted-evaluators/prompts/completeness-check.md +++ b/examples/features/weighted-evaluators/prompts/completeness-check.md @@ -10,9 +10,9 @@ Assess the completeness of the candidate response: - Provides sufficient context ## Input -- Question: {{ 
input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/correctness-check.md b/examples/features/weighted-evaluators/prompts/correctness-check.md index 8917c5e96..4c5fd4a42 100644 --- a/examples/features/weighted-evaluators/prompts/correctness-check.md +++ b/examples/features/weighted-evaluators/prompts/correctness-check.md @@ -10,9 +10,9 @@ Assess the correctness of the candidate response, checking for: - Technical correctness ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/experimental-check.md b/examples/features/weighted-evaluators/prompts/experimental-check.md index 9f26efb88..1ff680238 100644 --- a/examples/features/weighted-evaluators/prompts/experimental-check.md +++ b/examples/features/weighted-evaluators/prompts/experimental-check.md @@ -6,9 +6,9 @@ An experimental evaluator for collecting additional metrics without affecting sc This is an experimental evaluator used to test new evaluation criteria. Assess the response based on novel or experimental quality dimensions. 
## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/quality-evaluation.md b/examples/features/weighted-evaluators/prompts/quality-evaluation.md index c339deb7b..1e9933800 100644 --- a/examples/features/weighted-evaluators/prompts/quality-evaluation.md +++ b/examples/features/weighted-evaluators/prompts/quality-evaluation.md @@ -10,9 +10,9 @@ Assess the quality of the candidate response based on: - Relevance to the question ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/safety-check.md b/examples/features/weighted-evaluators/prompts/safety-check.md index 49601545b..621ce64eb 100644 --- a/examples/features/weighted-evaluators/prompts/safety-check.md +++ b/examples/features/weighted-evaluators/prompts/safety-check.md @@ -10,9 +10,9 @@ Review the candidate response for any safety concerns including: - Privacy violations ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/features/weighted-evaluators/prompts/style-evaluation.md b/examples/features/weighted-evaluators/prompts/style-evaluation.md index 1bb535e11..314c3b54d 100644 --- a/examples/features/weighted-evaluators/prompts/style-evaluation.md +++ b/examples/features/weighted-evaluators/prompts/style-evaluation.md @@ -10,9 +10,9 @@ Assess the style and presentation of the candidate 
response based on: - Use of examples and analogies ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Output Format Return a JSON object with: diff --git a/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts b/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts index de49a0d11..7612499b1 100644 --- a/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts +++ b/examples/showcase/evaluator-conformance/evaluators/keyword-grader.ts @@ -8,7 +8,28 @@ */ import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText, expectedOutputText, criteria }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput, criteria }) => { + const outputText = getMessageText(output ?? []); + const expectedOutputText = getMessageText(expectedOutput); const candidate = (outputText ?? '').toLowerCase().trim(); const expected = (expectedOutputText ?? 
'').toLowerCase().trim(); diff --git a/examples/showcase/export-screening/evals/validate_risk_output.ts b/examples/showcase/export-screening/evals/validate_risk_output.ts index a1ce9a8ca..8f98895d0 100644 --- a/examples/showcase/export-screening/evals/validate_risk_output.ts +++ b/examples/showcase/export-screening/evals/validate_risk_output.ts @@ -59,7 +59,27 @@ function extractExpectedRiskLevel( return null; } -export default defineCodeGrader(({ outputText, expectedOutput }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ output, expectedOutput }) => { + const outputText = getMessageText(output ?? 
[]); const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = []; // Parse candidate JSON diff --git a/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md b/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md index 927f79997..d4c6bc8e3 100644 --- a/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md +++ b/examples/showcase/multi-model-benchmark/prompts/accuracy-rubric.md @@ -8,9 +8,9 @@ Assess whether the candidate response is factually correct and aligns with the r ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Scoring diff --git a/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md b/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md index 3a784b3f9..96b91c994 100644 --- a/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md +++ b/examples/showcase/multi-model-benchmark/prompts/clarity-rubric.md @@ -8,9 +8,9 @@ Assess whether the candidate response is clear, well-structured, and easy to und ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference Answer: {{ expected_output }} +- Answer: {{ output }} ## Scoring diff --git a/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md b/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md index 6da863f4d..0ae9a76f1 100644 --- a/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md +++ b/examples/showcase/multi-model-benchmark/prompts/completeness-rubric.md @@ -8,9 +8,9 @@ Assess whether the candidate response addresses every part of the question and i ## Input -- Question: {{ input_text }} -- Reference Answer: {{ expected_output_text }} -- Answer: {{ output_text }} +- Question: {{ input }} +- Reference 
Answer: {{ expected_output }} +- Answer: {{ output }} ## Scoring diff --git a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md index 2a2e224f9..13b6d57b5 100644 --- a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md +++ b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v1.md @@ -4,9 +4,9 @@ Read the task/context in `question`, then read the candidate response in `answer Ignore any human labels or reference answers. Your only job is to decide whether the candidate response should PASS or FAIL against the rubric in `criteria`. ## Inputs -- Task and context: {{input_text}} +- Task and context: {{ input }} - Rubric: {{criteria}} -- Candidate response: {{output_text}} +- Candidate response: {{ output }} ## Output rules - Return score `1.0` when the response should PASS. diff --git a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md index 6c3b42f64..f48bbc824 100644 --- a/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md +++ b/examples/showcase/offline-grader-benchmark/prompts/grader-pass-fail-v2.md @@ -3,13 +3,13 @@ You are one member of a three-model grader panel. Evaluate the frozen agent response strictly from the task/context and rubric. Do not use hidden labels, reference answers, or speculate about the dataset author. ## Task + context -{{input_text}} +{{ input }} ## Rubric {{criteria}} ## Frozen response under review -{{output_text}} +{{ output }} ## Decision policy 1. PASS only if the response satisfies the required policy constraints. 
diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts index f19549310..b610470d5 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts +++ b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts @@ -100,9 +100,28 @@ function compareResponses( return { winner: 'TIE', aAdvantages, bAdvantages }; } +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + export default defineCodeGrader((input) => { - const candidate = input.outputText ?? ''; - const reference = input.expectedOutputText ?? ''; + const candidate = getMessageText(input.output ?? 
[]); + const reference = getMessageText(input.expectedOutput); // If no reference, we can't do pairwise comparison if (!reference) { @@ -113,7 +132,7 @@ export default defineCodeGrader((input) => { { text: 'No reference for comparison', passed: false, - evidence: 'Pairwise comparison requires expectedOutputText field', + evidence: 'Pairwise comparison requires expected output messages', }, ], }; diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts index 7dc3dba51..e9b694874 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts @@ -49,7 +49,27 @@ const toolTaskMappings: Record = { validate: ['check', 'validate', 'verify', 'confirm'], }; -export default defineCodeGrader(({ inputText, criteria, output }) => { +function getMessageText( + messages: readonly { role: string; content?: unknown }[], + role = 'assistant', +): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === role) { + if (typeof msg.content === 'string') return msg.content; + if (Array.isArray(msg.content)) { + return msg.content + .filter((b: { type?: string }) => b.type === 'text') + .map((b: { text?: string }) => b.text) + .join('\n'); + } + } + } + return ''; +} + +export default defineCodeGrader(({ input, criteria, output }) => { + const inputText = getMessageText(input, 'user'); const assertions: Array<{ text: string; passed: boolean }> = []; const toolCalls = extractToolCalls(output ?? 
[]); diff --git a/packages/core/src/evaluation/evaluators/code-evaluator.ts b/packages/core/src/evaluation/evaluators/code-evaluator.ts index a1cecc08c..c2410924b 100644 --- a/packages/core/src/evaluation/evaluators/code-evaluator.ts +++ b/packages/core/src/evaluation/evaluators/code-evaluator.ts @@ -64,7 +64,6 @@ export class CodeEvaluator implements Evaluator { const payload = { criteria: context.evalCase.criteria, expectedOutput: context.evalCase.expected_output, - outputText: context.candidate, output: outputForPayload, outputPath, inputFiles: context.evalCase.file_paths, @@ -78,8 +77,6 @@ export class CodeEvaluator implements Evaluator { fileChanges: context.fileChanges ?? null, workspacePath: context.workspacePath ?? null, config: this.config ?? null, - inputText: context.evalCase.question, - expectedOutputText: context.evalCase.reference_answer ?? '', }; const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2); diff --git a/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts b/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts index 1a3d26bee..b8d80feff 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/evaluators/llm-grader-prompt.ts @@ -68,11 +68,12 @@ function assembleFreeform( : evalCase.question; const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2), - [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? 
'', + // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? '').trim(), diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts index abd550398..553f06774 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader.ts +++ b/packages/core/src/evaluation/evaluators/llm-grader.ts @@ -6,7 +6,7 @@ import { z } from 'zod'; import type { Provider, ProviderResponse } from '../providers/types.js'; import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; -import { TEMPLATE_VARIABLES } from '../template-variables.js'; +import { DEPRECATED_TEMPLATE_VARIABLES, TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { AssertionEntry, JsonObject, RubricItem } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; @@ -74,13 +74,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r {{${TEMPLATE_VARIABLES.CRITERIA}}} [[ ## question ## ]] -{{${TEMPLATE_VARIABLES.INPUT_TEXT}}} +{{${TEMPLATE_VARIABLES.INPUT}}} [[ ## reference_answer ## ]] -{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}} +{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}} [[ ## answer ## ]] -{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`; +{{${TEMPLATE_VARIABLES.OUTPUT}}}`; type GraderProviderResolver = (context: EvaluationContext) => Promise; @@ -206,17 +206,15 @@ export class LlmGraderEvaluator implements Evaluator { ? context.promptInputs.question : context.evalCase.question; - // Prepare template variables for substitution + // Prepare template variables for substitution. + // Primary variables resolve to human-readable text; deprecated _text aliases map to the same values. 
const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2), - [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify( - context.evalCase.expected_output, - null, - 2, - ), - [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + // Deprecated aliases — same values as the primary variables above [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), @@ -228,6 +226,10 @@ export class LlmGraderEvaluator implements Evaluator { // Build user prompt based on custom template or default template const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE; + + // Warn once per run when custom templates use deprecated _text variable names + warnDeprecatedTemplateVars(evaluatorTemplate); + let userPrompt = substituteVariables(evaluatorTemplate, variables); // Append file_changes section to default template only when present @@ -615,13 +617,18 @@ export class LlmGraderEvaluator implements Evaluator { const variables: Record = { [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? 
'', + // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', }; if (this.evaluatorTemplate) { + warnDeprecatedTemplateVars(this.evaluatorTemplate); return substituteVariables(this.evaluatorTemplate, variables); } @@ -685,11 +692,16 @@ export class LlmGraderEvaluator implements Evaluator { if (this.evaluatorTemplate) { const variables: Record = { [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', }; + warnDeprecatedTemplateVars(this.evaluatorTemplate); const customPrompt = substituteVariables(this.evaluatorTemplate, variables); const outputSchema = @@ -1018,6 +1030,34 @@ export function substituteVariables(template: string, variables: Record(); + +/** + * Emit a one-time stderr warning when a template uses deprecated _text variable names. + * Skips the default template (which uses the new names and should never trigger warnings). 
+ */ +export function warnDeprecatedTemplateVars(template: string): void { + if (warnedTemplateStrings.has(template)) return; + + const used: string[] = []; + for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) { + if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) { + used.push(`{{ ${deprecated} }} → {{ ${replacement} }}`); + } + } + + if (used.length > 0) { + warnedTemplateStrings.add(template); + console.warn( + `${ANSI_YELLOW}⚠ Deprecated template variables detected (they still work but will be removed in a future version):\n ${used.join('\n ')}\n Update your custom evaluator template to use the new names.${ANSI_RESET}`, + ); + } +} + export function calculateRubricScore( result: z.infer, rubrics: readonly RubricItem[], diff --git a/packages/core/src/evaluation/evaluators/prompt-resolution.ts b/packages/core/src/evaluation/evaluators/prompt-resolution.ts index 7c20387d5..5429e62ab 100644 --- a/packages/core/src/evaluation/evaluators/prompt-resolution.ts +++ b/packages/core/src/evaluation/evaluators/prompt-resolution.ts @@ -75,7 +75,6 @@ async function executePromptTemplate( const payload = { criteria: context.evalCase.criteria, expectedOutput: context.evalCase.expected_output, - outputText: context.candidate, output: context.output ?? null, inputFiles: context.evalCase.file_paths, input: context.evalCase.input, @@ -83,8 +82,6 @@ async function executePromptTemplate( fileChanges: context.fileChanges ?? null, workspacePath: context.workspacePath ?? null, config: config ?? context.config ?? null, - inputText: context.evalCase.question, - expectedOutputText: context.evalCase.reference_answer ?? 
'', }; const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2); diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index a429a2c11..31d289145 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -1,6 +1,18 @@ /** * Template variable constants for evaluator prompts. * These variables can be used in custom evaluator templates with {{ variable_name }} syntax. + * + * Primary variables: + * - {{ input }} — input as plain text (single-turn) or role-prefixed conversation (multi-turn) + * - {{ output }} — last assistant message as plain text + * - {{ expected_output }} — reference answer as plain text + * - {{ criteria }} — evaluation criteria string + * - {{ file_changes }} — file diff (if available) + * + * Deprecated aliases (emit a warning when used in custom templates): + * - {{ input_text }} → use {{ input }} + * - {{ output_text }} → use {{ output }} + * - {{ expected_output_text }} → use {{ expected_output }} */ export const TEMPLATE_VARIABLES = { EXPECTED_OUTPUT: 'expected_output', @@ -8,8 +20,11 @@ export const TEMPLATE_VARIABLES = { INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', + /** @deprecated Use INPUT instead — resolves to the same text value. */ INPUT_TEXT: 'input_text', + /** @deprecated Use OUTPUT instead — resolves to the same text value. */ OUTPUT_TEXT: 'output_text', + /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */ EXPECTED_OUTPUT_TEXT: 'expected_output_text', } as const; @@ -28,6 +43,16 @@ export const VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_V * At least one of these should be present in a custom evaluator template. 
*/ export const REQUIRED_TEMPLATE_VARIABLES = new Set([ - TEMPLATE_VARIABLES.OUTPUT_TEXT, + TEMPLATE_VARIABLES.OUTPUT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT, ]); + +/** + * Deprecated template variable names that still work but trigger a warning. + * Maps deprecated name → replacement name. + */ +export const DEPRECATED_TEMPLATE_VARIABLES: ReadonlyMap = new Map([ + [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT], + [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT], + [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT], +]); diff --git a/packages/core/src/evaluation/validation/prompt-validator.ts b/packages/core/src/evaluation/validation/prompt-validator.ts index 9f7ccf914..8f8101809 100644 --- a/packages/core/src/evaluation/validation/prompt-validator.ts +++ b/packages/core/src/evaluation/validation/prompt-validator.ts @@ -1,6 +1,10 @@ import { readFile } from 'node:fs/promises'; -import { TEMPLATE_VARIABLES, VALID_TEMPLATE_VARIABLES } from '../template-variables.js'; +import { + DEPRECATED_TEMPLATE_VARIABLES, + TEMPLATE_VARIABLES, + VALID_TEMPLATE_VARIABLES, +} from '../template-variables.js'; const ANSI_YELLOW = '\u001b[33m'; const ANSI_RESET = '\u001b[0m'; @@ -36,15 +40,33 @@ export function validateTemplateVariables(content: string, source: string): void match = variablePattern.exec(content); } - // Check if template contains required variables for evaluation - const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT); - const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT); + // Check if template contains required variables for evaluation. + // Accept both new names (output, expected_output) and deprecated aliases (output_text, expected_output_text). 
+ const hasCandidateAnswer = + foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || + foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT); + const hasExpectedOutput = + foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || + foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT); const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput; // ERROR: Missing required fields - throw error to skip this evaluator/eval case if (!hasRequiredFields) { throw new Error( - `Missing required fields. Must include at least one of:\n - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}\n - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`, + `Missing required fields. Must include at least one of:\n - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}\n - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`, + ); + } + + // WARNING: Deprecated variables - show warning but continue + const deprecatedUsed: string[] = []; + for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) { + if (foundVariables.has(deprecated)) { + deprecatedUsed.push(`{{ ${deprecated} }} → {{ ${replacement} }}`); + } + } + if (deprecatedUsed.length > 0) { + console.warn( + `${ANSI_YELLOW}Warning: Template at ${source} uses deprecated variable names:\n ${deprecatedUsed.join('\n ')}\n These still work but will be removed in a future version.${ANSI_RESET}`, ); } diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index df7ef94b8..a7b92afa4 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -715,6 +715,7 @@ describe('CodeEvaluator', () => { const result = await evaluator.evaluate({ evalCase: evalCaseWithExpectedMessages, candidate: expectedCandidate, + output: [{ role: 'assistant', content: '{"decision":"ACCEPT"}' }], target: baseTarget, provider: graderProvider, attempt: 0, @@ -765,6 +766,7 @@ describe('CodeEvaluator', () => { const result = await evaluator.evaluate({ evalCase: baseTestCase, 
candidate: 'Added logging to the implementation', + output: [{ role: 'assistant', content: 'Added logging to the implementation' }], target: baseTarget, provider: graderProvider, attempt: 0, @@ -791,6 +793,7 @@ describe('CodeEvaluator', () => { expected_output: [{ role: 'assistant', content: 'test' }], }, candidate: 'Test candidate', + output: [{ role: 'assistant', content: 'Test candidate' }], target: baseTarget, provider: graderProvider, attempt: 0, @@ -848,6 +851,7 @@ describe('CodeEvaluator', () => { expected_output: [{ role: 'assistant', content: { decision: 'ACCEPT' } }], }, candidate: '{"decision":"ACCEPT"}', + output: [{ role: 'assistant', content: '{"decision":"ACCEPT"}' }], target: baseTarget, provider: graderProvider, attempt: 0, diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index 6b47e2941..5eeda30de 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -45,12 +45,10 @@ describe('LlmGraderEvaluator Variable Substitution', () => { it('substitutes template variables in custom prompt', async () => { const formattedQuestion = '@[User]: What is the status?\n\n@[Assistant]: Requesting more info.'; const customPrompt = ` -Question: {{input_text}} +Question: {{input}} Outcome: {{criteria}} -Reference: {{expected_output_text}} -Candidate: {{output_text}} -Input Messages: {{input}} -Expected Messages: {{expected_output}} +Reference: {{expected_output}} +Candidate: {{output}} File Changes: {{file_changes}} `; @@ -82,22 +80,13 @@ File Changes: {{file_changes}} const request = graderProvider.lastRequest; expect(request).toBeDefined(); - // When custom evaluatorTemplate is provided, it goes in the user prompt (question) - // System prompt only contains the output schema + // Primary variables resolve to human-readable text expect(request?.question).toContain(`Question: ${formattedQuestion}`); 
expect(request?.question).not.toContain('Original Question Text'); expect(request?.question).toContain('Outcome: Expected Outcome Text'); expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify input JSON stringification - expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); - - // Verify expected_output JSON stringification - expect(request?.question).toContain('Expected Messages: ['); - expect(request?.question).toContain('"value": "Expected Output Message"'); - // Verify file_changes substitution expect(request?.question).toContain('File Changes: diff --git a/test.txt b/test.txt'); expect(request?.question).toContain('+added line'); @@ -107,6 +96,45 @@ File Changes: {{file_changes}} expect(request?.systemPrompt).not.toContain(`Question: ${formattedQuestion}`); }); + it('deprecated _text aliases still resolve correctly', async () => { + const formattedQuestion = 'What is 2+2?'; + const customPrompt = ` +Question: {{input_text}} +Reference: {{expected_output_text}} +Candidate: {{output_text}} +`; + + const graderProvider = new CapturingProvider({ + text: JSON.stringify({ + score: 0.9, + assertions: [{ text: 'OK', passed: true }], + }), + }); + + const evaluator = new LlmGraderEvaluator({ + resolveGraderProvider: async () => graderProvider, + evaluatorTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: { ...baseTestCase, evaluator: 'llm-grader' }, + candidate: 'Four', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: formattedQuestion }, + now: new Date(), + }); + + const request = graderProvider.lastRequest; + expect(request).toBeDefined(); + + // Deprecated aliases resolve to the same text values as the primary variables + expect(request?.question).toContain(`Question: ${formattedQuestion}`); + 
expect(request?.question).toContain('Reference: Reference Answer Text'); + expect(request?.question).toContain('Candidate: Four'); + }); + it('does not substitute if no variables are present', async () => { const customPrompt = 'Fixed prompt without variables'; const promptQuestion = 'Summarize the latest logs without markers.'; @@ -143,12 +171,10 @@ File Changes: {{file_changes}} it('substitutes template variables with whitespace inside braces', async () => { const formattedQuestion = 'What is the status?'; const customPrompt = ` -Question: {{ input_text }} +Question: {{ input }} Outcome: {{ criteria }} -Reference: {{ expected_output_text }} -Candidate: {{ output_text }} -Input Messages: {{ input }} -Expected Messages: {{ expected_output }} +Reference: {{ expected_output }} +Candidate: {{ output }} `; const graderProvider = new CapturingProvider({ @@ -184,12 +210,6 @@ Expected Messages: {{ expected_output }} expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify JSON stringified variables were also substituted - expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); - expect(request?.question).toContain('Expected Messages: ['); - expect(request?.question).toContain('"value": "Expected Output Message"'); - // Verify no unreplaced template markers remain expect(request?.question).not.toMatch(/\{\{\s*\w+\s*\}\}/); }); diff --git a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts index 0e76f15d3..0c56c86a5 100644 --- a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts +++ b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts @@ -1614,8 +1614,8 @@ describe('parseEvaluators - composite assertions field', () => { tempDir = path.join(os.tmpdir(), `agentv-test-composite-assert-${Date.now()}`); await 
mkdir(tempDir, { recursive: true }); // Create dummy prompt files for llm-grader members (must include required template fields) - await writeFile(path.join(tempDir, 'safety.md'), 'Evaluate safety of {{ output_text }}'); - await writeFile(path.join(tempDir, 'quality.md'), 'Evaluate quality of {{ output_text }}'); + await writeFile(path.join(tempDir, 'safety.md'), 'Evaluate safety of {{ output }}'); + await writeFile(path.join(tempDir, 'quality.md'), 'Evaluate quality of {{ output }}'); }); afterAll(async () => { diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index fed09e1db..fb46f8fe9 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -975,9 +975,12 @@ describe('runEvalCase trace integration', () => { `import { readFileSync } from 'fs'; const stdin = readFileSync(0, 'utf8'); const input = JSON.parse(stdin); -console.log(\`Question: \${input.input_text} -Answer: \${input.output_text} -Reference: \${input.expected_output_text ?? 'none'}\`); +const question = (input.input || []).map((m) => String(m.content ?? '')).join('\\n'); +const answer = (input.output || []).map((m) => String(m.content ?? '')).join('\\n'); +const ref = (input.expected_output || []).map((m) => String(m.content ?? '')).join('\\n') || 'none'; +console.log(\`Question: \${question} +Answer: \${answer} +Reference: \${ref}\`); `, ); @@ -1009,7 +1012,9 @@ Reference: \${input.expected_output_text ?? 'none'}\`); evalCase: { ...baseTestCase, question: 'What is 2+2?', + input: [{ role: 'user', content: 'What is 2+2?' }], reference_answer: 'The sum is 4', + expected_output: [{ role: 'assistant', content: 'The sum is 4' }], assertions: [ { name: 'ts-prompt-eval', @@ -1040,7 +1045,9 @@ Reference: \${input.expected_output_text ?? 
'none'}\`); `const fs = require('fs'); const stdin = fs.readFileSync(0, 'utf8'); const input = JSON.parse(stdin); -console.log('Question: ' + input.input_text + '\\nAnswer: ' + input.output_text); +const question = (input.input || []).map((m) => String(m.content || '')).join('\\n'); +const answer = (input.output || []).map((m) => String(m.content || '')).join('\\n'); +console.log('Question: ' + question + '\\nAnswer: ' + answer); `, ); @@ -1070,6 +1077,7 @@ console.log('Question: ' + input.input_text + '\\nAnswer: ' + input.output_text) evalCase: { ...baseTestCase, question: 'Test question', + input: [{ role: 'user', content: 'Test question' }], assertions: [ { name: 'js-prompt-eval', diff --git a/packages/core/test/evaluation/validation/prompt-validator.test.ts b/packages/core/test/evaluation/validation/prompt-validator.test.ts new file mode 100644 index 000000000..9a189c953 --- /dev/null +++ b/packages/core/test/evaluation/validation/prompt-validator.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from 'vitest'; + +import { validateTemplateVariables } from '../../../src/evaluation/validation/prompt-validator.js'; + +describe('validateTemplateVariables', () => { + it('passes when template contains {{ output }}', () => { + expect(() => validateTemplateVariables('Score: {{ output }}', 'test.txt')).not.toThrow(); + }); + + it('passes when template contains {{ expected_output }}', () => { + expect(() => + validateTemplateVariables('Reference: {{ expected_output }}', 'test.txt'), + ).not.toThrow(); + }); + + it('passes when template contains deprecated {{ output_text }}', () => { + expect(() => validateTemplateVariables('Score: {{ output_text }}', 'test.txt')).not.toThrow(); + }); + + it('passes when template contains deprecated {{ expected_output_text }}', () => { + expect(() => + validateTemplateVariables('Reference: {{ expected_output_text }}', 'test.txt'), + ).not.toThrow(); + }); + + it('throws when no required or deprecated variables are present', () => 
{ + expect(() => validateTemplateVariables('No variables here', 'test.txt')).toThrow( + 'Missing required fields', + ); + }); + + it('throws when only non-required variables are present', () => { + expect(() => + validateTemplateVariables('Input: {{ input }} Criteria: {{ criteria }}', 'test.txt'), + ).toThrow('Missing required fields'); + }); +}); diff --git a/packages/core/test/fixtures/test-define-grader.ts b/packages/core/test/fixtures/test-define-grader.ts index 820d48bc1..f5c41f75d 100644 --- a/packages/core/test/fixtures/test-define-grader.ts +++ b/packages/core/test/fixtures/test-define-grader.ts @@ -4,12 +4,15 @@ */ import { defineCodeGrader } from '../../../eval/src/index.js'; -export default defineCodeGrader(({ outputText, criteria }) => { +export default defineCodeGrader(({ output, criteria }) => { const assertions: { text: string; passed: boolean }[] = []; + // Extract text from the output message array + const candidateText = (output ?? []).map((m) => String(m.content ?? '')).join(' '); + // Simple check: does candidate mention the criteria keywords? const outcomeWords = criteria.toLowerCase().split(/\s+/); - const candidateWords = outputText.toLowerCase().split(/\s+/); + const candidateWords = candidateText.toLowerCase().split(/\s+/); for (const word of outcomeWords) { if (word.length > 3 && candidateWords.includes(word)) { diff --git a/packages/core/test/fixtures/test-grader-with-details.cjs b/packages/core/test/fixtures/test-grader-with-details.cjs index e3ce45923..b11c34d36 100644 --- a/packages/core/test/fixtures/test-grader-with-details.cjs +++ b/packages/core/test/fixtures/test-grader-with-details.cjs @@ -7,7 +7,10 @@ const fs = require('node:fs'); const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); -const hasCandidate = typeof input.output_text === 'string'; +// Extract candidate text from the output message array +const outputMessages = Array.isArray(input.output) ? 
input.output : []; +const candidateText = outputMessages.map((m) => String(m.content ?? '')).join(''); +const hasCandidate = candidateText.length > 0; // Emit details with structured metrics console.log( diff --git a/packages/core/test/fixtures/test-grader.cjs b/packages/core/test/fixtures/test-grader.cjs index 4b049b1c2..e341fb69f 100644 --- a/packages/core/test/fixtures/test-grader.cjs +++ b/packages/core/test/fixtures/test-grader.cjs @@ -4,11 +4,16 @@ const fs = require('node:fs'); const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_output); -const hasCandidate = typeof input.output_text === 'string'; +// Extract candidate text from the output message array +const outputMessages = Array.isArray(input.output) ? input.output : []; +const candidateText = outputMessages + .map((m) => (typeof m.content === 'string' ? m.content : JSON.stringify(m.content))) + .join(''); +const hasCandidate = candidateText.length > 0; let candidateDecisionOk = false; try { - const obj = JSON.parse(input.output_text); + const obj = JSON.parse(candidateText); candidateDecisionOk = obj && obj.decision === 'ACCEPT'; } catch {} diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index e69e9e625..1d654f329 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -14,17 +14,12 @@ import { CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, - type EnrichedCodeGraderInput, } from './schemas.js'; /** * Context provided to assertion handlers. - * - * Same shape as CodeGraderInput but with `inputText`, `outputText`, and - * `expectedOutputText` guaranteed to be strings (populated by the runtime - * before the handler is called). */ -export type AssertionContext = EnrichedCodeGraderInput; +export type AssertionContext = CodeGraderInput; /** * Known built-in assertion types. Custom types are extensible via string. 
@@ -193,11 +188,11 @@ export async function runAssertion(handler: AssertionHandler): Promise { }); } - // Enrich input with text accessors and deprecation warnings + // Enrich input — no-op pass-through enrichInput(input); - // After enrichment, text accessors are guaranteed to be strings - const rawResult = await handler(input as EnrichedCodeGraderInput); + // Run handler + const rawResult = await handler(input); const normalized = normalizeScore(rawResult); const result = CodeGraderResultSchema.parse(normalized); console.log(JSON.stringify(result, null, 2)); diff --git a/packages/eval/src/deprecation.ts b/packages/eval/src/deprecation.ts index 735cdc508..35d80939f 100644 --- a/packages/eval/src/deprecation.ts +++ b/packages/eval/src/deprecation.ts @@ -1,26 +1,20 @@ /** * Input enrichment utilities for code grader and assertion runtimes. - * Populates text convenience accessors on validated input objects. + * + * With the removal of text convenience accessors (`inputText`, `outputText`, + * `expectedOutputText`) from CodeGraderInput, this module is a no-op pass-through. + * Kept for backward compatibility — existing runtimes call `enrichInput()` and + * the call is harmless. */ import type { CodeGraderInput } from './schemas.js'; /** - * Populate `inputText`, `outputText`, and `expectedOutputText` accessors - * on the validated input object. + * Enrich a validated CodeGraderInput. * - * Text accessors are always strings. Structured fields (`input`, `output`, `expectedOutput`) - * remain `Message[]` always. + * Previously populated text convenience accessors; now a no-op pass-through since + * those fields were removed. Code graders should extract text from `Message.content` + * using `getTextContent()` from `@agentv/core` instead. 
*/ export function enrichInput(input: CodeGraderInput): CodeGraderInput { - // Ensure expectedOutputText is always a string (may be undefined from schema) - if (input.expectedOutputText === undefined) { - Object.defineProperty(input, 'expectedOutputText', { - value: '', - writable: false, - configurable: true, - enumerable: true, - }); - } - return input; } diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 49c740167..c814b698d 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -8,9 +8,12 @@ * #!/usr/bin/env bun * import { defineAssertion } from '@agentv/eval'; * - * export default defineAssertion(({ outputText }) => ({ - * pass: outputText.includes('hello'), - * assertions: [{ text: 'Checks greeting', passed: outputText.includes('hello') }], + * export default defineAssertion(({ output, criteria }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * return { + * pass: text.includes('hello'), + * assertions: [{ text: 'Checks greeting', passed: text.includes('hello') }], + * }; * })); * ``` * @@ -19,33 +22,15 @@ * #!/usr/bin/env bun * import { defineCodeGrader } from '@agentv/eval'; * - * export default defineCodeGrader(({ trace, outputText }) => ({ - * score: trace?.eventCount <= 5 ? 1.0 : 0.5, - * assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }], + * export default defineCodeGrader(({ trace, output }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * return { + * score: trace?.eventCount <= 5 ? 
1.0 : 0.5, + * assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }], + * }; * })); * ``` * - * @example Code grader with target access (requires `target` config in YAML) - * ```typescript - * #!/usr/bin/env bun - * import { defineCodeGrader, createTargetClient } from '@agentv/eval'; - * - * export default defineCodeGrader(async ({ inputText }) => { - * const target = createTargetClient(); - * if (!target) { - * return { score: 0, assertions: [{ text: 'Target not available', passed: false }] }; - * } - * - * const response = await target.invoke({ - * question: `Evaluate: ${inputText}`, - * systemPrompt: 'Respond with JSON: { "score": 0-1 }' - * }); - * - * const result = JSON.parse(response.rawText ?? '{}'); - * return { score: result.score ?? 0 }; - * }); - * ``` - * * @packageDocumentation */ @@ -60,7 +45,6 @@ export { PromptTemplateInputSchema, type CodeGraderInput, type CodeGraderResult, - type EnrichedCodeGraderInput, type TraceSummary, type Message, type ToolCall, @@ -161,25 +145,10 @@ export function defineCodeGrader(handler: CodeGraderHandler): void { * ```typescript * import { definePromptTemplate } from '@agentv/eval'; * - * export default definePromptTemplate((ctx) => ` - * Question: ${ctx.inputText} - * Answer: ${ctx.outputText} - * - * ${ctx.expectedOutputText ? `Reference: ${ctx.expectedOutputText}` : ''} - * `); - * ``` - * - * @example With conditional logic - * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; - * * export default definePromptTemplate((ctx) => { - * const rubric = ctx.config?.rubric as string | undefined; - * return ` - * Question: ${ctx.inputText} - * Candidate Answer: ${ctx.outputText} - * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} - * `; + * const question = ctx.input.map(m => String(m.content ?? '')).join('\n'); + * const answer = ctx.output?.map(m => String(m.content ?? '')).join('\n') ?? 
''; + * return `Question: ${question}\nAnswer: ${answer}`; * }); * ``` */ @@ -209,9 +178,12 @@ export function definePromptTemplate(handler: PromptTemplateHandler): void { * ```typescript * import { defineAssertion } from '@agentv/eval'; * - * export default defineAssertion(({ outputText }) => ({ - * pass: outputText.toLowerCase().includes('hello'), - * assertions: [{ text: 'Checks for greeting', passed: outputText.toLowerCase().includes('hello') }], + * export default defineAssertion(({ output }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * return { + * pass: text.toLowerCase().includes('hello'), + * assertions: [{ text: 'Checks for greeting', passed: text.toLowerCase().includes('hello') }], + * }; * })); * ``` * @@ -219,8 +191,9 @@ export function definePromptTemplate(handler: PromptTemplateHandler): void { * ```typescript * import { defineAssertion } from '@agentv/eval'; * - * export default defineAssertion(({ outputText, trace }) => { - * const hasContent = outputText.length > 0 ? 0.5 : 0; + * export default defineAssertion(({ output, trace }) => { + * const text = output?.map(m => String(m.content ?? '')).join(' ') ?? ''; + * const hasContent = text.length > 0 ? 0.5 : 0; * const isEfficient = (trace?.eventCount ?? 0) <= 5 ? 
0.5 : 0; * return { * score: hasContent + isEfficient, @@ -229,7 +202,7 @@ export function definePromptTemplate(handler: PromptTemplateHandler): void { * { text: 'Efficient', passed: !!isEfficient }, * ], * }; - * }); + * })); * ``` */ export function defineAssertion(handler: AssertionHandler): void { diff --git a/packages/eval/src/prompt-template.ts b/packages/eval/src/prompt-template.ts index 09e7f8e5b..c3669e5e6 100644 --- a/packages/eval/src/prompt-template.ts +++ b/packages/eval/src/prompt-template.ts @@ -6,16 +6,13 @@ import { readFileSync } from 'node:fs'; import { toCamelCaseDeep } from './case-conversion.js'; import { enrichInput } from './deprecation.js'; -import { type EnrichedCodeGraderInput, PromptTemplateInputSchema } from './schemas.js'; +import { type CodeGraderInput, PromptTemplateInputSchema } from './schemas.js'; /** * Handler function type for prompt templates. * Returns the prompt string to use for evaluation. - * - * The input is enriched at runtime: `inputText`, `outputText`, and - * `expectedOutputText` are always populated before the handler is called. */ -export type PromptTemplateHandler = (input: EnrichedCodeGraderInput) => string | Promise; +export type PromptTemplateHandler = (input: CodeGraderInput) => string | Promise; /** * Read stdin synchronously (works in both Node.js and Bun). @@ -42,11 +39,11 @@ export async function runPromptTemplate(handler: PromptTemplateHandler): Promise // 4. Validate input with Zod const input = PromptTemplateInputSchema.parse(camelInput); - // 5. Enrich input with text accessors and deprecation warnings + // 5. Enrich input — no-op pass-through enrichInput(input); - // 6. Run handler (input is now enriched with guaranteed text accessors) - const prompt = await handler(input as EnrichedCodeGraderInput); + // 6. Run handler + const prompt = await handler(input); // 6. 
Output raw string (not JSON) - the prompt itself console.log(prompt); @@ -71,37 +68,13 @@ export async function runPromptTemplate(handler: PromptTemplateHandler): Promise * * @example * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; - * - * export default definePromptTemplate((ctx) => ` - * Question: ${ctx.inputText} - * Answer: ${ctx.outputText} - * - * ${ctx.expectedOutputText ? `Reference: ${ctx.expectedOutputText}` : ''} - * `); - * ``` - * - * @example With conditional logic - * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; - * - * export default definePromptTemplate((ctx) => { - * const rubric = ctx.config?.rubric as string | undefined; - * return ` - * Question: ${ctx.inputText} - * Candidate Answer: ${ctx.outputText} - * ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''} - * `; - * }); - * ``` - * - * @example Async handler - * ```typescript - * import { definePromptTemplate } from '@agentv/eval'; + * import { definePromptTemplate, type CodeGraderInput } from '@agentv/eval'; + * import { getTextContent } from '@agentv/core'; * - * export default definePromptTemplate(async (ctx) => { - * // Async operations are supported - * return `Question: ${ctx.inputText}\nAnswer: ${ctx.outputText}`; + * export default definePromptTemplate((ctx: CodeGraderInput) => { + * const question = ctx.input.map(m => getTextContent(m.content)).join('\n'); + * const answer = ctx.output?.map(m => getTextContent(m.content)).join('\n') ?? ''; + * return `Question: ${question}\nAnswer: ${answer}`; * }); * ``` */ diff --git a/packages/eval/src/runtime.ts b/packages/eval/src/runtime.ts index 2363cd3b2..42099dce6 100644 --- a/packages/eval/src/runtime.ts +++ b/packages/eval/src/runtime.ts @@ -11,17 +11,13 @@ import { CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, - type EnrichedCodeGraderInput, } from './schemas.js'; /** * Handler function type for code graders. 
- * - * The input is enriched at runtime: `inputText`, `outputText`, and - * `expectedOutputText` are always populated before the handler is called. */ export type CodeGraderHandler = ( - input: EnrichedCodeGraderInput, + input: CodeGraderInput, ) => CodeGraderResult | Promise; /** @@ -85,11 +81,11 @@ export async function runCodeGrader(handler: CodeGraderHandler): Promise { }); } - // 6. Enrich input with text accessors and deprecation warnings + // 6. Enrich input — no-op pass-through enrichInput(input); - // 7. Run handler (input is now enriched with guaranteed text accessors) - const rawResult = await handler(input as EnrichedCodeGraderInput); + // 7. Run handler + const rawResult = await handler(input); // 8. Validate and normalize output const result = CodeGraderResultSchema.parse({ diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 43b541bbc..3385ac5dd 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -54,14 +54,12 @@ export const MessageSchema = z.object({ /** * Code grader input schema (camelCase, converted from snake_case wire format). * - * Text convenience accessors (`inputText`, `outputText`, `expectedOutputText`) are always - * strings. Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`. + * Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`. + * To extract plain text from message content, use `getTextContent()` from `@agentv/core`. */ export const CodeGraderInputSchema = z.object({ criteria: z.string(), expectedOutput: z.array(MessageSchema), - /** Last assistant message content as string. */ - outputText: z.string(), output: z.array(MessageSchema).nullable().optional(), /** Path to a temp file containing the output JSON (used for large payloads). 
*/ outputPath: z.string().optional(), @@ -76,10 +74,6 @@ export const CodeGraderInputSchema = z.object({ fileChanges: z.string().nullable().optional(), workspacePath: z.string().nullable().optional(), config: z.record(z.unknown()).nullable().optional(), - /** All input messages as plain text. Single message: content only. Multiple: @role prefixed. */ - inputText: z.string(), - /** Last expected output message content as plain text. */ - expectedOutputText: z.string().optional(), }); /** @@ -107,20 +101,6 @@ export const CodeGraderResultSchema = z.object({ export type CodeGraderInput = z.infer; export type CodeGraderResult = z.infer; -/** - * CodeGraderInput after `enrichInput()` has run. - * - * The text accessors (`inputText`, `outputText`, `expectedOutputText`) - * are always populated by the runtime before the handler is called, so they are - * guaranteed to be `string` (never `undefined`). - * - * Handler function signatures (`CodeGraderHandler`, `AssertionHandler`) use this - * type so that user code can destructure `{ outputText }` without null-checks. - */ -export type EnrichedCodeGraderInput = Omit & { - /** Expected output content as string. */ - readonly expectedOutputText: string; -}; export type TraceSummary = z.infer; export type Message = z.infer; export type ToolCall = z.infer; diff --git a/packages/eval/test/define-code-grader.test.ts b/packages/eval/test/define-code-grader.test.ts index 6fcfb8014..67a77e878 100644 --- a/packages/eval/test/define-code-grader.test.ts +++ b/packages/eval/test/define-code-grader.test.ts @@ -11,18 +11,15 @@ import { describe('CodeGraderInputSchema', () => { const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' 
}], }; it('parses valid input', () => { const result = CodeGraderInputSchema.parse(validInput); - expect(result.inputText).toBe('What is 2+2?'); - expect(result.outputText).toBe('The answer is 4'); + expect(result.criteria).toBe('The answer should be 4'); }); it('accepts optional trace', () => { @@ -173,15 +170,13 @@ describe('CodeGraderResultSchema', () => { describe('CodeJudgeInputSchema (backward-compat alias)', () => { it('parses valid input via deprecated alias', () => { const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' }], }; const result = CodeJudgeInputSchema.parse(validInput); - expect(result.inputText).toBe('What is 2+2?'); + expect(result.criteria).toBe('The answer should be 4'); }); }); diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts index 9e335fbd0..890b80201 100644 --- a/packages/eval/test/define-prompt-template.test.ts +++ b/packages/eval/test/define-prompt-template.test.ts @@ -5,18 +5,14 @@ import { PromptTemplateInputSchema } from '../src/schemas.js'; describe('PromptTemplateInputSchema', () => { // Minimal valid input with all required fields const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [], - outputText: 'The answer is 4', inputFiles: [], input: [], }; it('parses valid input with all required fields', () => { const result = PromptTemplateInputSchema.parse(validInput); - expect(result.inputText).toBe('What is 2+2?'); - expect(result.outputText).toBe('The answer is 4'); expect(result.criteria).toBe('The answer should be 4'); expect(result.expectedOutput).toEqual([]); expect(result.inputFiles).toEqual([]); @@ -30,15 +26,6 @@ describe('PromptTemplateInputSchema', () => { expect(() => PromptTemplateInputSchema.parse(minimalInput)).toThrow(); }); 
- it('accepts optional expectedOutputText', () => { - const inputWithReference = { - ...validInput, - expectedOutputText: 'The sum of 2 and 2 is 4', - }; - const result = PromptTemplateInputSchema.parse(inputWithReference); - expect(result.expectedOutputText).toBe('The sum of 2 and 2 is 4'); - }); - it('accepts optional trace', () => { const inputWithTrace = { ...validInput, @@ -115,11 +102,8 @@ describe('PromptTemplateInputSchema', () => { it('accepts full input with all fields', () => { const fullInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - expectedOutputText: 'The sum is 4', - outputText: 'The answer is 4', output: [{ role: 'assistant', content: 'The answer is 4' }], inputFiles: ['/path/to/input.txt'], input: [{ role: 'user', content: 'What is 2+2?' }], @@ -131,10 +115,7 @@ describe('PromptTemplateInputSchema', () => { config: { rubric: 'Check correctness' }, }; const result = PromptTemplateInputSchema.parse(fullInput); - expect(result.inputText).toBe('What is 2+2?'); expect(result.criteria).toBe('The answer should be 4'); - expect(result.expectedOutputText).toBe('The sum is 4'); - expect(result.outputText).toBe('The answer is 4'); expect(result.config).toEqual({ rubric: 'Check correctness' }); }); }); diff --git a/packages/eval/test/deprecation.test.ts b/packages/eval/test/deprecation.test.ts index 7bfd5ac62..e025fd973 100644 --- a/packages/eval/test/deprecation.test.ts +++ b/packages/eval/test/deprecation.test.ts @@ -10,45 +10,17 @@ function buildInput(overrides?: Record) { return CodeGraderInputSchema.parse({ criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' 
}], - inputText: 'What is 2+2?', ...overrides, }); } -describe('enrichInput — text accessors', () => { - it('preserves inputText value', () => { - const input = buildInput({ inputText: 'Hello world' }); - enrichInput(input); - expect(input.inputText).toBe('Hello world'); - }); - - it('preserves outputText value', () => { - const input = buildInput({ outputText: 'The result is 42' }); - enrichInput(input); - expect(input.outputText).toBe('The result is 42'); - }); - - it('populates expectedOutputText from schema value', () => { - const input = buildInput({ expectedOutputText: 'Expected text' }); - enrichInput(input); - expect(input.expectedOutputText).toBe('Expected text'); - }); - - it('populates expectedOutputText as empty string when undefined', () => { - const input = buildInput({ expectedOutputText: undefined }); - enrichInput(input); - expect(input.expectedOutputText).toBe(''); - }); - - it('text accessors are always strings', () => { +describe('enrichInput — pass-through', () => { + it('returns the same object unchanged', () => { const input = buildInput(); - enrichInput(input); - expect(typeof input.inputText).toBe('string'); - expect(typeof input.outputText).toBe('string'); - expect(typeof input.expectedOutputText).toBe('string'); + const result = enrichInput(input); + expect(result).toBe(input); }); it('structured fields (input, output, expectedOutput) remain Message[]', () => { @@ -63,58 +35,3 @@ describe('enrichInput — text accessors', () => { expect(Array.isArray(input.expectedOutput)).toBe(true); }); }); - -describe('CodeGraderInputSchema — fields', () => { - it('accepts inputText, outputText, expectedOutputText in schema', () => { - const input = CodeGraderInputSchema.parse({ - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' 
}], - inputText: 'What is 2+2?', - outputText: 'The answer is 4', - expectedOutputText: 'The answer is 4', - }); - expect(input.inputText).toBe('What is 2+2?'); - expect(input.outputText).toBe('The answer is 4'); - expect(input.expectedOutputText).toBe('The answer is 4'); - }); - - it('inputText is required in schema', () => { - expect(() => - CodeGraderInputSchema.parse({ - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' }], - }), - ).toThrow(); - }); - - it('expectedOutputText is optional in schema', () => { - const input = CodeGraderInputSchema.parse({ - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' }], - inputText: 'What is 2+2?', - }); - expect(input.expectedOutputText).toBeUndefined(); - }); - - it('does not accept deprecated question field', () => { - expect(() => - CodeGraderInputSchema.parse({ - question: 'What is 2+2?', - criteria: 'The answer should be 4', - expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', - inputFiles: [], - input: [{ role: 'user', content: 'What is 2+2?' 
}], - inputText: 'What is 2+2?', - }), - ).not.toThrow(); // extra fields are stripped by zod by default - }); -}); diff --git a/packages/eval/test/file-backed-output.test.ts b/packages/eval/test/file-backed-output.test.ts index 3b569a50b..58e931f3e 100644 --- a/packages/eval/test/file-backed-output.test.ts +++ b/packages/eval/test/file-backed-output.test.ts @@ -7,10 +7,8 @@ import { type CodeGraderInput, CodeGraderInputSchema } from '../src/schemas.js'; describe('CodeGraderInputSchema with outputPath', () => { const validInput = { - inputText: 'What is 2+2?', criteria: 'The answer should be 4', expectedOutput: [{ role: 'assistant', content: '4' }], - outputText: 'The answer is 4', inputFiles: [], input: [{ role: 'user', content: 'What is 2+2?' }], }; @@ -58,10 +56,8 @@ describe('Lazy file-backed output loading', () => { writeFileSync(filePath, JSON.stringify(messages)); const input: CodeGraderInput = CodeGraderInputSchema.parse({ - inputText: 'test', criteria: 'test', expectedOutput: [], - outputText: 'test', output: null, outputPath: filePath, inputFiles: [], @@ -93,10 +89,8 @@ describe('Lazy file-backed output loading', () => { it('uses inline output when outputPath is absent', () => { const input: CodeGraderInput = CodeGraderInputSchema.parse({ - inputText: 'test', criteria: 'test', expectedOutput: [], - outputText: 'test', output: [{ role: 'assistant', content: 'inline' }], inputFiles: [], input: [],