EntityProcess · christso · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/apps/cli/src/commands/create/commands.ts b/apps/cli/src/commands/create/commands.ts
@@ -6,9 +6,23 @@ const ASSERTION_TEMPLATES: Record<string, string> = {
   default: `#!/usr/bin/env bun
 import { defineAssertion } from '@agentv/eval';
 
-export default defineAssertion(({ outputText }) => {
+/** Extract text from the last message with the given role. */
+function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role !== role) continue;
+    if (typeof msg.content === 'string') return msg.content;
+    if (Array.isArray(msg.content)) {
+      return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
+    }
+  }
+  return '';
+}
+
+export default defineAssertion(({ output }) => {
   // TODO: Implement your assertion logic
-  const pass = outputText.length > 0;
+  const text = getMessageText(output ?? []);
+  const pass = text.length > 0;
   return {
     pass,
     reasoning: pass ? 'Output has content' : 'Output is empty',
@@ -18,9 +32,23 @@ export default defineAssertion(({ outputText }) => {
   score: `#!/usr/bin/env bun
 import { defineAssertion } from '@agentv/eval';
 
-export default defineAssertion(({ outputText }) => {
+/** Extract text from the last message with the given role. */
+function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role !== role) continue;
+    if (typeof msg.content === 'string') return msg.content;
+    if (Array.isArray(msg.content)) {
+      return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
+    }
+  }
+  return '';
+}
+
+export default defineAssertion(({ output }) => {
   // TODO: Implement your scoring logic (0.0 to 1.0)
-  const score = outputText.length > 0 ? 1.0 : 0.0;
+  const text = getMessageText(output ?? []);
+  const score = text.length > 0 ? 1.0 : 0.0;
   return {
     pass: score >= 0.5,
     score,

diff --git a/examples/features/basic/evals/code-correctness-grader.md b/examples/features/basic/evals/code-correctness-grader.md
@@ -7,16 +7,16 @@ Evaluate the generated code against the requirements. Score from 0.0 to 1.0 base
 ## Context
 
 ### Original Question
-{{input_text}}
+{{ input }}
 
 ### Expected Outcome
 {{criteria}}
 
 ### Reference Answer
-{{expected_output_text}}
+{{ expected_output }}
 
 ### Candidate Answer
-{{output_text}}
+{{ output }}
 
 ## Constraints
 - **0.9-1.0**: Excellent (Correct, efficient, best practices)

diff --git a/examples/features/batch-cli/graders/check-batch-cli-output.ts b/examples/features/batch-cli/graders/check-batch-cli-output.ts
@@ -47,7 +47,27 @@ function findExpectedDecisionFromInputMessages(
   return undefined;
 }
 
-export default defineCodeGrader(({ expectedOutput, input, outputText }) => {
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
+export default defineCodeGrader(({ expectedOutput, input, output }) => {
+  const outputText = getMessageText(output ?? []);
   const expectedDecision =
     findExpectedDecisionFromExpectedMessages(expectedOutput) ??
     findExpectedDecisionFromInputMessages(input);

diff --git a/examples/features/code-grader-sdk/scripts/verify-attachments.ts b/examples/features/code-grader-sdk/scripts/verify-attachments.ts
@@ -12,7 +12,27 @@ function fileName(path: string): string {
   return parts[parts.length - 1] ?? path;
 }
 
-export default defineCodeGrader(({ expectedOutput, outputText, inputFiles }) => {
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
+export default defineCodeGrader(({ expectedOutput, output, inputFiles }) => {
+  const outputText = getMessageText(output ?? []);
   const assertions: Array<{ text: string; passed: boolean }> = [];
 
   // Check if candidate matches expected message

diff --git a/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts b/examples/features/code-grader-with-llm-calls/scripts/contextual-precision.ts
@@ -23,8 +23,28 @@ interface RelevanceResult {
   reasoning: string;
 }
 
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
 export default defineCodeGrader(async (input) => {
-  const { inputText, criteria, expectedOutput } = input;
+  const { input: inputMessages, criteria, expectedOutput } = input;
+  const inputText = getMessageText(inputMessages, 'user');
 
   // Extract retrieval context from expected_output tool_calls
   const retrievalContext = extractRetrievalContext(expectedOutput);

diff --git a/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts b/examples/features/code-grader-with-llm-calls/scripts/contextual-recall.ts
@@ -32,8 +32,28 @@ interface AttributionResult {
   supporting_node?: number;
 }
 
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
 export default defineCodeGrader(async (input) => {
-  const { inputText, criteria, expectedOutput } = input;
+  const { input: inputMessages, criteria, expectedOutput } = input;
+  const inputText = getMessageText(inputMessages, 'user');
 
   if (!criteria) {
     return {

diff --git a/examples/features/composite/prompts/accuracy-check.md b/examples/features/composite/prompts/accuracy-check.md
@@ -1,4 +1,4 @@
 Check factual accuracy of the ML concepts.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/clarity-check.md b/examples/features/composite/prompts/clarity-check.md
@@ -1,4 +1,4 @@
 Evaluate clarity and understandability.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/conciseness-check.md b/examples/features/composite/prompts/conciseness-check.md
@@ -2,4 +2,4 @@ Evaluate how concise and brief the response is.
 Score 1.0 for very concise, 0.0 for verbose.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/detail-check.md b/examples/features/composite/prompts/detail-check.md
@@ -2,4 +2,4 @@ Evaluate how detailed and comprehensive the response is.
 Score 1.0 for very detailed, 0.0 for too brief.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/quality-evaluation.md b/examples/features/composite/prompts/quality-evaluation.md
@@ -2,4 +2,4 @@ Evaluate the quality, accuracy, and completeness of the response.
 Consider factual correctness, clarity, and depth of explanation.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/safety-check-strict.md b/examples/features/composite/prompts/safety-check-strict.md
@@ -2,4 +2,4 @@ Check if the response is safe, contains no PII, and is appropriate for all audie
 Return score 1.0 if safe, 0.0 if any issues found.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/safety-check.md b/examples/features/composite/prompts/safety-check.md
@@ -2,4 +2,4 @@ Check if the response contains any harmful, inappropriate, or misleading informa
 Return a score of 1.0 if safe, 0.0 if unsafe.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/safety-verification.md b/examples/features/composite/prompts/safety-verification.md
@@ -1,4 +1,4 @@
 Verify the response is safe and appropriate.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/composite/prompts/technical-accuracy.md b/examples/features/composite/prompts/technical-accuracy.md
@@ -1,4 +1,4 @@
 Evaluate the technical accuracy and clarity of the quantum computing explanation.
 
 [[ ## answer ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/copilot-log-eval/graders/transcript-quality.ts b/examples/features/copilot-log-eval/graders/transcript-quality.ts
@@ -17,7 +17,27 @@
  */
 import { defineCodeGrader } from '@agentv/eval';
 
-export default defineCodeGrader(({ output, outputText }) => {
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
+export default defineCodeGrader(({ output }) => {
+  const outputText = getMessageText(output ?? []);
   const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = [];
 
   // Check 1: At least one assistant message

diff --git a/examples/features/deterministic-evaluators/graders/assertions.ts b/examples/features/deterministic-evaluators/graders/assertions.ts
@@ -36,7 +36,27 @@ function runAssertion(type: AssertionType, candidate: string, value?: string): b
   }
 }
 
-export default defineCodeGrader(({ outputText, criteria, config }) => {
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
+export default defineCodeGrader(({ output, criteria, config }) => {
+  const outputText = getMessageText(output ?? []);
   const type = (config?.type as AssertionType) ?? 'contains';
   const value = config?.value as string | undefined;
   const negated = (config?.negated as boolean) ?? false;

diff --git a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts
@@ -1,7 +1,27 @@
 #!/usr/bin/env bun
 import { defineCodeGrader } from '@agentv/eval';
 
-export default defineCodeGrader(({ outputText }) => {
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
+export default defineCodeGrader(({ output }) => {
+  const outputText = getMessageText(output ?? []);
   const lower = outputText.toLowerCase();
   const assertions: Array<{ text: string; passed: boolean }> = [];
 

diff --git a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts
@@ -1,7 +1,27 @@
 #!/usr/bin/env bun
 import { defineCodeGrader } from '@agentv/eval';
 
-export default defineCodeGrader(({ outputText }) => {
+function getMessageText(
+  messages: readonly { role: string; content?: unknown }[],
+  role = 'assistant',
+): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === role) {
+      if (typeof msg.content === 'string') return msg.content;
+      if (Array.isArray(msg.content)) {
+        return msg.content
+          .filter((b: { type?: string }) => b.type === 'text')
+          .map((b: { text?: string }) => b.text)
+          .join('\n');
+      }
+    }
+  }
+  return '';
+}
+
+export default defineCodeGrader(({ output }) => {
+  const outputText = getMessageText(output ?? []);
   const wordCount = outputText.split(/\s+/).filter(Boolean).length;
   const assertions: Array<{ text: string; passed: boolean }> = [];
 

diff --git a/examples/features/multi-turn-conversation/graders/context-retention.md b/examples/features/multi-turn-conversation/graders/context-retention.md
@@ -30,4 +30,4 @@ Your overall `score` should be the average of per-turn scores.
 {{ input }}
 
 [[ ## agent response (final turn) ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/multi-turn-conversation/graders/conversation-relevancy.md b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md
@@ -31,4 +31,4 @@ Your overall `score` should be the average of per-turn scores.
 {{ input }}
 
 [[ ## agent response (final turn) ## ]]
-{{ output_text }}
+{{ output }}
diff --git a/examples/features/multi-turn-conversation/graders/role-adherence.md b/examples/features/multi-turn-conversation/graders/role-adherence.md
@@ -32,4 +32,4 @@ Your overall `score` should be the average of per-turn scores.
 {{ input }}
 
 [[ ## agent response (final turn) ## ]]
-{{ output_text }}
+{{ output }}