From e2670cf7449b835c852c4985df2b236e05707fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Rivas?= Date: Tue, 5 May 2026 13:46:30 -0300 Subject: [PATCH 1/3] feat(agent/test/run-eval): promote command to GA and delegate logic to @salesforce/agents W-22203426 Removes local copies of evalNormalizer, evalFormatter, and yamlSpecTranslator now that they live in the shared library. Rewrites run-eval.ts as a thin CLI shell that imports normalizePayload, translateTestSpec, resolveAgent, executeBatches, buildResultSummary, and formatResults from @salesforce/agents. Promotes the command from state='beta'/hidden to state='ga'. Co-Authored-By: Claude Sonnet 4.6 --- src/commands/agent/test/run-eval.ts | 155 +------ src/evalFormatter.ts | 355 --------------- src/evalNormalizer.ts | 500 ---------------------- src/yamlSpecTranslator.ts | 265 ------------ test/commands/agent/test/run-eval.test.ts | 360 ++++++++++++++++ test/evalFormatter.test.ts | 2 +- test/evalNormalizer.test.ts | 2 +- test/yamlSpecTranslator.test.ts | 2 +- 8 files changed, 381 insertions(+), 1260 deletions(-) delete mode 100644 src/evalFormatter.ts delete mode 100644 src/evalNormalizer.ts delete mode 100644 src/yamlSpecTranslator.ts create mode 100644 test/commands/agent/test/run-eval.test.ts diff --git a/src/commands/agent/test/run-eval.ts b/src/commands/agent/test/run-eval.ts index d2569526..d0187566 100644 --- a/src/commands/agent/test/run-eval.ts +++ b/src/commands/agent/test/run-eval.ts @@ -16,11 +16,22 @@ import { readFile } from 'node:fs/promises'; import { Flags, SfCommand, toHelpSection } from '@salesforce/sf-plugins-core'; -import { EnvironmentVariable, Messages, Org, SfError } from '@salesforce/core'; -import { type EvalPayload, normalizePayload, splitIntoBatches } from '../../../evalNormalizer.js'; -import { type EvalApiResponse, formatResults, type ResultFormat } from '../../../evalFormatter.js'; +import { EnvironmentVariable, Messages, SfError } from '@salesforce/core'; +import { + type EvalPayload, + normalizePayload, + splitIntoBatches, + type EvalApiResponse, + formatResults, + type ResultFormat, + isYamlTestSpec, + parseTestSpec, + translateTestSpec, + resolveAgent, + executeBatches, + buildResultSummary, +} from '@salesforce/agents'; import { resultFormatFlag } from '../../../flags.js'; -import { isYamlTestSpec, parseTestSpec, translateTestSpec } from '../../../yamlSpecTranslator.js'; Messages.importMessagesDirectoryFromMetaUrl(import.meta.url); const messages = Messages.loadMessages('@salesforce/plugin-agent', 'agent.test.run-eval'); @@ -30,132 +41,11 @@ export type RunEvalResult = { summary: { passed: number; failed: number; scored: number; errors: number }; }; -// --- Standalone helper functions --- - -type ApiHeaders = { - orgId: string; - userId: string; - instanceUrl: string; -}; - -async function getApiHeaders(org: Org): Promise { - const conn = org.getConnection(); - const userInfo = await conn.request<{ user_id: string }>(`${conn.instanceUrl}/services/oauth2/userinfo`); - - return { - orgId: org.getOrgId(), - userId: userInfo.user_id, - instanceUrl: conn.instanceUrl, - }; -} - -async function callEvalApi(org: Org, payload: EvalPayload, headers: ApiHeaders): Promise<{ results?: unknown[] }> { - const conn = org.getConnection(); - - return conn.request<{ results?: unknown[] }>({ - url: 'https://api.salesforce.com/einstein/evaluation/v1/tests', - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-sfdc-core-tenant-id': `core/prod/${headers.orgId}`, - 'x-org-id': headers.orgId, - 
'x-sfdc-core-instance-url': headers.instanceUrl, - 'x-sfdc-user-id': headers.userId, - 'x-client-feature-id': 'AIPlatformEvaluation', - 'x-sfdc-app-context': 'EinsteinGPT', - }, - body: JSON.stringify(payload), - }); -} - -async function resolveAgent(org: Org, apiName: string): Promise<{ agentId: string; versionId: string }> { - const conn = org.getConnection(); - - // Escape single quotes to prevent SOQL injection - const escapedApiName = apiName.replace(/'/g, "\\'"); - - const botResult = await conn.query<{ Id: string }>( - `SELECT Id FROM BotDefinition WHERE DeveloperName = '${escapedApiName}'` - ); - if (!botResult.records.length) { - throw messages.createError('error.agentNotFound', [apiName]); - } - const agentId = botResult.records[0].Id; - - // Filter to published/active versions only - const versionResult = await conn.query<{ Id: string }>( - `SELECT Id FROM BotVersion WHERE BotDefinitionId = '${agentId}' ORDER BY VersionNumber DESC LIMIT 1` - ); - if (!versionResult.records.length) { - throw messages.createError('error.agentVersionNotFound', [apiName]); - } - const versionId = versionResult.records[0].Id; - - return { agentId, versionId }; -} - -async function executeBatches( - org: Org, - batches: Array<EvalPayload['tests']>, - log: (msg: string) => void -): Promise<unknown[]> { - // Pre-calculate headers once to avoid redundant API calls - const headers = await getApiHeaders(org); - - // Execute all batches in parallel for better performance - if (batches.length > 1) { - log(messages.getMessage('info.batchProgress', [batches.length, batches.length, 'total'])); - } - - const batchPromises = batches.map(async (batch) => { - const batchPayload: EvalPayload = { tests: batch }; - const resultObj = await callEvalApi(org, batchPayload, headers); - return resultObj.results ?? []; - }); - - const batchResults = await Promise.all(batchPromises); - return batchResults.flat(); -} - -function buildResultSummary(mergedResponse: EvalApiResponse): { - summary: RunEvalResult['summary']; - testSummaries: RunEvalResult['tests']; -} { - const summary = { passed: 0, failed: 0, scored: 0, errors: 0 }; - const testSummaries: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }> = []; - - for (const testResult of mergedResponse.results ?? []) { - const tr = testResult as Record<string, unknown>; - const testId = (tr.id as string) ?? 'unknown'; - const evalResults = (tr.evaluation_results as Array<Record<string, unknown>>) ?? []; - const testErrors = (tr.errors as unknown[]) ?? []; - - const passed = evalResults.filter((e) => e.is_pass === true).length; - const failed = evalResults.filter((e) => e.is_pass === false).length; - const scored = evalResults.filter((e) => e.score != null && e.is_pass == null).length; - - summary.passed += passed; - summary.failed += failed; - summary.scored += scored; - summary.errors += testErrors.length; - - testSummaries.push({ - id: testId, - status: failed > 0 || testErrors.length > 0 ? 'failed' : 'passed', - evaluations: evalResults, - outputs: (tr.outputs as unknown[]) ??
[], - }); - } - - return { summary, testSummaries }; -} - export default class AgentTestRunEval extends SfCommand { public static readonly summary = messages.getMessage('summary'); public static readonly description = messages.getMessage('description'); public static readonly examples = messages.getMessages('examples'); - public static state = 'beta'; - public static readonly hidden = true; + public static state = 'ga'; public static readonly envVariablesSection = toHelpSection( 'ENVIRONMENT VARIABLES', @@ -207,14 +97,10 @@ export default class AgentTestRunEval extends SfCommand { // If spec looks like it might be a file path (not parseable content), read the file try { - // Try to detect if it's actual content vs a file path - // If it's a valid YAML/JSON, it's content; otherwise treat as file path if (!isYamlTestSpec(rawContent)) { JSON.parse(rawContent); } - // If we got here, it's valid content } catch { - // Not valid content, must be a file path - read it try { rawContent = await readFile(flags.spec, 'utf-8'); } catch (e) { @@ -228,17 +114,14 @@ export default class AgentTestRunEval extends SfCommand { let agentApiName = flags['api-name']; if (isYamlTestSpec(rawContent)) { - // YAML TestSpec detected — translate to EvalPayload const spec = parseTestSpec(rawContent); payload = translateTestSpec(spec); - // Auto-infer api-name from subjectName if not explicitly provided if (!agentApiName) { agentApiName = spec.subjectName; this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()])); } } else { - // JSON EvalPayload (original behavior) try { payload = JSON.parse(rawContent) as EvalPayload; } catch (e) { @@ -303,16 +186,14 @@ export default class AgentTestRunEval extends SfCommand { const mergedResponse: EvalApiResponse = { results: allResults as EvalApiResponse['results'] }; - // 9. Format output + // 8. Format output const resultFormat = (flags['result-format'] ?? 'human') as ResultFormat; const formatted = formatResults(mergedResponse, resultFormat); this.log(formatted); - // 10. Build structured result for --json + // 9. Build structured result for --json const { summary, testSummaries } = buildResultSummary(mergedResponse); - // Set exit code to 1 only for execution errors (tests couldn't run) - // Test failures (assertions failed) are business logic and should not affect exit code if (summary.errors > 0) { process.exitCode = 1; } diff --git a/src/evalFormatter.ts b/src/evalFormatter.ts deleted file mode 100644 index 2b965c7e..00000000 --- a/src/evalFormatter.ts +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright 2026, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -export type ResultFormat = 'human' | 'json' | 'junit' | 'tap'; - -type EvalOutput = { - type?: string; - id?: string; - session_id?: string; - response?: unknown; -}; - -type EvalResult = { - id?: string; - score?: number | null; - is_pass?: boolean | null; - actual_value?: string; - expected_value?: string; - error_message?: string; -}; - -type TestError = { - id?: string; - error_message?: string; -}; - -type TestResult = { - id?: string; - outputs?: EvalOutput[]; - evaluation_results?: EvalResult[]; - errors?: TestError[]; -}; - -export type EvalApiResponse = { - results?: TestResult[]; -}; - -export function formatResults(results: EvalApiResponse, format: ResultFormat): string { - switch (format) { - case 'human': - return formatHuman(results); - case 'json': - return JSON.stringify(results, null, 2); - case 'junit': - return formatJunit(results); - case 'tap': - return formatTap(results); - default: - return formatHuman(results); - } -} - -// --- formatHuman helpers --- - -function formatOutputLines(outputs: EvalOutput[]): string[] { - const lines: string[] = []; - - for (const output of outputs) { - const stepType = output.type ?? ''; - const stepId = output.id ?? ''; - - if (stepType === 'agent.create_session') { - const sessionId = output.session_id ?? 'N/A'; - lines.push(`- **Create Session**: ${sessionId}`); - } else if (stepType === 'agent.send_message') { - let agentMsg = output.response; - if (agentMsg !== null && typeof agentMsg === 'object' && !Array.isArray(agentMsg)) { - const msgObj = agentMsg as Record<string, unknown>; - const msgs = msgObj.messages as Array<Record<string, unknown>> | undefined; - agentMsg = msgs?.[0]?.message ?? String(agentMsg); - } - const msgStr = String(agentMsg ?? ''); - const displayMsg = msgStr.length > 200 ? msgStr.substring(0, 200) + '...' : msgStr; - lines.push(`- **Agent Response** (${stepId}): ${displayMsg}`); - } else if (stepType === 'agent.get_state') { - const respData = output.response; - if (respData !== null && typeof respData === 'object') { - const resp = respData as Record<string, unknown>; - const planner = resp.planner_response as Record<string, unknown> | undefined; - const lastExec = planner?.lastExecution as Record<string, unknown> | undefined; - const topic = lastExec?.topic ?? 'N/A'; - const latency = lastExec?.latency ?? 'N/A'; - lines.push(`- **Topic Selected**: ${String(topic)}`); - lines.push(`- **Response Latency**: ${String(latency)}ms`); - } else { - lines.push(`- **State**: ${String(respData).substring(0, 200)}`); - } - } - } - - return lines; -} - -function formatEvaluationTable(evalResults: EvalResult[]): string[] { - const lines: string[] = []; - - if (evalResults.length > 0) { - lines.push('### Evaluation Results\n'); - lines.push('| Metric | Score | Pass | Actual | Expected |'); - lines.push('|--------|-------|------|--------|----------|'); - - for (const evalR of evalResults) { - const metricId = evalR.id ?? 'unknown'; - const score = evalR.score; - const scoreStr = score != null ? score.toFixed(3) : 'N/A'; - const isPass = evalR.is_pass; - const passStr = isPass === true ? 'PASS' : isPass === false ? 'FAIL' : 'N/A'; - const actual = String(evalR.actual_value ?? '').substring(0, 60); - const expected = String(evalR.expected_value ??
'').substring(0, 60); - const error = evalR.error_message; - - if (error) { - lines.push(`| ${metricId} | ERROR | - | ${error.substring(0, 80)} | - |`); - } else { - lines.push(`| ${metricId} | ${scoreStr} | ${passStr} | ${actual} | ${expected} |`); - } - } - - lines.push(''); - } - - return lines; -} - -function formatErrorLines(errors: TestError[]): string[] { - const lines: string[] = []; - - if (errors.length > 0) { - lines.push('### Errors\n'); - for (const error of errors) { - const errorId = error.id ?? 'unknown'; - const errorMsg = error.error_message ?? String(error); - lines.push(`- **${errorId}**: ${errorMsg}`); - } - lines.push(''); - } - - return lines; -} - -function formatTestSummaryLines(evalResults: EvalResult[], errors: TestError[]): string[] { - const lines: string[] = []; - - const totalEvals = evalResults.length; - const passed = evalResults.filter((e) => e.is_pass === true).length; - const failed = evalResults.filter((e) => e.is_pass === false).length; - const scored = evalResults.filter((e) => e.score != null && e.is_pass == null).length; - - lines.push(`**Summary**: ${totalEvals} evaluations`); - if (passed || failed) { - lines.push(` - Passed: ${passed}, Failed: ${failed}`); - } - if (scored) { - lines.push(` - Scored (no threshold): ${scored}`); - } - if (errors.length > 0) { - lines.push(` - Errors: ${errors.length}`); - } - lines.push(''); - - return lines; -} - -function formatHuman(results: EvalApiResponse): string { - const lines: string[] = ['# Agent Evaluation Results\n']; - - for (const testResult of results.results ?? []) { - const testId = testResult.id ?? 'unknown'; - const errors = testResult.errors ?? []; - const evalResults = testResult.evaluation_results ?? []; - const outputs = testResult.outputs ?? []; - - lines.push(`## Test: ${testId}\n`); - - lines.push(...formatOutputLines(outputs)); - lines.push(''); - lines.push(...formatEvaluationTable(evalResults)); - lines.push(...formatErrorLines(errors)); - lines.push(...formatTestSummaryLines(evalResults, errors)); - } - - return lines.join('\n'); -} - -function formatJunit(results: EvalApiResponse): string { - const allTests: Array<{ - name: string; - classname: string; - failed: boolean; - errored: boolean; - message: string; - score: string; - }> = []; - - for (const testResult of results.results ?? []) { - const testId = testResult.id ?? 'unknown'; - - for (const evalR of testResult.evaluation_results ?? []) { - const stepId = evalR.id ?? 'unknown'; - const name = `${testId}.${stepId}`; - const score = evalR.score; - const isPass = evalR.is_pass; - const error = evalR.error_message; - - allTests.push({ - name, - classname: 'agent-eval-labs', - failed: isPass === false, - errored: !!error, - message: error - ? error - : isPass === false - ? `Expected ${String(evalR.expected_value ?? '')} but got ${String(evalR.actual_value ?? '')}` - : '', - score: score != null ? score.toFixed(3) : 'N/A', - }); - } - - for (const err of testResult.errors ?? []) { - const stepId = err.id ?? 'unknown'; - allTests.push({ - name: `${testId}.${stepId}`, - classname: 'agent-eval-labs', - failed: false, - errored: true, - message: err.error_message ?? 
'Unknown error', - score: 'N/A', - }); - } - } - - const totalTests = allTests.length; - const failures = allTests.filter((t) => t.failed).length; - const errors = allTests.filter((t) => t.errored).length; - - const lines: string[] = [ - '<?xml version="1.0" encoding="UTF-8"?>', - '<testsuites>', - `  <testsuite name="agent-eval-labs" tests="${totalTests}" failures="${failures}" errors="${errors}">`, - ]; - - for (const tc of allTests) { - lines.push(`    <testcase name="${tc.name}" classname="${tc.classname}">`); - if (tc.errored) { - lines.push(`      <error>${escapeXml(tc.message)}</error>`); - } else if (tc.failed) { - lines.push(`      <failure message="${escapeXml(tc.message)}">Score: ${tc.score}</failure>`); - } - lines.push('    </testcase>'); - } - - lines.push('  </testsuite>'); - lines.push('</testsuites>'); - - return lines.join('\n'); -} - -// --- formatTap helpers --- - -type TapEntry = { - ok: boolean; - name: string; - score: string; - expected?: string; - actual?: string; - error?: string; -}; - -function buildTapEntries(results: EvalApiResponse): TapEntry[] { - const entries: TapEntry[] = []; - - for (const testResult of results.results ?? []) { - const testId = testResult.id ?? 'unknown'; - - for (const evalR of testResult.evaluation_results ?? []) { - const stepId = evalR.id ?? 'unknown'; - const name = `${testId}.${stepId}`; - const score = evalR.score; - const isPass = evalR.is_pass; - const error = evalR.error_message; - - entries.push({ - ok: isPass !== false && !error, - name, - score: score != null ? score.toFixed(3) : 'N/A', - expected: evalR.expected_value != null ? String(evalR.expected_value) : undefined, - actual: evalR.actual_value != null ? String(evalR.actual_value) : undefined, - error: error ?? undefined, - }); - } - - for (const err of testResult.errors ?? []) { - const stepId = err.id ?? 'unknown'; - entries.push({ - ok: false, - name: `${testId}.${stepId}`, - score: 'N/A', - error: err.error_message ?? 'Unknown error', - }); - } - } - - return entries; -} - -function formatTap(results: EvalApiResponse): string { - const entries = buildTapEntries(results); - - const lines: string[] = ['TAP version 13', `1..${entries.length}`]; - - for (let i = 0; i < entries.length; i++) { - const e = entries[i]; - const num = i + 1; - const prefix = e.ok ? 'ok' : 'not ok'; - lines.push(`${prefix} ${num} - ${e.name} (score: ${e.score})`); - - if (!e.ok) { - lines.push(' ---'); - if (e.expected !== undefined) { - lines.push(` expected: "${e.expected}"`); - } - if (e.actual !== undefined) { - lines.push(` actual: "${e.actual}"`); - } - if (e.error) { - lines.push(` error: "${e.error}"`); - } - lines.push(' ...'); - } - } - - return lines.join('\n'); -} - -function escapeXml(str: string): string { - return str - .replace(/&/g, '&amp;') - .replace(/</g, '&lt;') - .replace(/>/g, '&gt;') - .replace(/"/g, '&quot;') - .replace(/'/g, '&apos;'); -} diff --git a/src/evalNormalizer.ts b/src/evalNormalizer.ts deleted file mode 100644 index f9901cd1..00000000 --- a/src/evalNormalizer.ts +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Copyright 2026, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -/* eslint-disable camelcase */ - -// --- Types --- - -export type EvalPayload = { - tests: EvalTest[]; -}; - -export type EvalTest = { - id: string; - steps: EvalStep[]; -}; - -export type EvalStep = { - [key: string]: unknown; - type: string; - id: string; -}; - -// --- Evaluator classification --- - -const SCORING_EVALUATORS = new Set([ - 'evaluator.text_alignment', - 'evaluator.hallucination_detection', - 'evaluator.citation_recall', - 'evaluator.answer_faithfulness', -]); - -const ASSERTION_EVALUATORS = new Set(['evaluator.string_assertion', 'evaluator.json_assertion']); - -const DEFAULT_METRIC_NAMES: Record<string, string> = { - 'evaluator.text_alignment': 'base.cosine_similarity', - 'evaluator.hallucination_detection': 'hallucination_detection', - 'evaluator.citation_recall': 'citation_recall', - 'evaluator.answer_faithfulness': 'answer_faithfulness', -}; - -const SCORING_VALID_FIELDS = new Set([ - 'type', - 'id', - 'generated_output', - 'reference_answer', - 'metric_name', - 'threshold', -]); - -const ASSERTION_VALID_FIELDS = new Set([ - 'type', - 'id', - 'actual', - 'expected', - 'operator', - 'threshold', - 'json_path', - 'json_schema', - 'metric_name', -]); - -const VALID_AGENT_FIELDS: Record<string, Set<string>> = { - 'agent.create_session': new Set([ - 'type', - 'id', - 'agent_id', - 'agent_version_id', - 'use_agent_api', - 'planner_id', - 'state', - 'setupSessionContext', - 'context_variables', - ]), - 'agent.send_message': new Set(['type', 'id', 'session_id', 'utterance']), - 'agent.get_state': new Set(['type', 'id', 'session_id']), -}; - -// --- Auto-correction maps --- - -const AGENT_CORRECTIONS: Record<string, string> = { - agentId: 'agent_id', - agentVersionId: 'agent_version_id', - sessionId: 'session_id', - text: 'utterance', - message: 'utterance', - input: 'utterance', - prompt: 'utterance', - user_message: 'utterance', - userMessage: 'utterance', -}; - -const EVALUATOR_CORRECTIONS: Record<string, string> = { - subject: 'actual', - expectedValue: 'expected', - expected_value: 'expected', - actualValue: 'actual', - actual_value: 'actual', - assertionType: 'operator', - assertion_type: 'operator', - comparator: 'operator', -}; - -// --- camelCase alias maps for agent.create_session --- - -const AGENT_FIELD_ALIASES: Record<string, string> = { - useAgentApi: 'use_agent_api', - plannerId: 'planner_id', - plannerDefinitionId: 'planner_id', - planner_definition_id: 'planner_id', - planner_version_id: 'planner_id', - plannerVersionId: 'planner_id', -}; - -// --- Scoring evaluator field aliases --- - -const SCORING_FIELD_ALIASES: Record<string, string> = { - actual: 'generated_output', - expected: 'reference_answer', - actual_value: 'generated_output', - expected_value: 'reference_answer', - actual_output: 'generated_output', - expected_output: 'reference_answer', - response: 'generated_output', - ground_truth: 'reference_answer', -}; - -// --- Assertion evaluator field aliases --- - -const ASSERTION_FIELD_ALIASES: Record<string, string> = { - actual_value: 'actual', - expected_value: 'expected', - generated_output: 'actual', - reference_answer: 'expected', - actual_output: 'actual', - expected_output: 'expected', - response: 'actual', - ground_truth: 'expected', -}; - -// --- MCP shorthand field mapping --- - -// MCP uses `field: "gs1.planner_state.topic"` — map to Eval API `actual` with correct JSONPath -const MCP_FIELD_MAP: Record<string, string> = { - 'planner_state.topic': 'response.planner_response.lastExecution.topic', - 'planner_state.invokedActions': 'response.planner_response.lastExecution.invokedActions', - 'planner_state.actionsSequence':
'response.planner_response.lastExecution.invokedActions', - response: 'response', - 'response.messages': 'response', -}; - -// --- Main entry point --- - -/** - * Apply all normalizations to a test payload. - * Passes run in order: mcp-shorthand -> auto-correct -> camelCase -> evaluator fields -> shorthand refs -> defaults -> strip. - */ -export function normalizePayload(payload: EvalPayload): EvalPayload { - const normalized: EvalPayload = { - tests: payload.tests.map((test) => { - let steps = [...test.steps]; - steps = normalizeMcpShorthand(steps); - steps = autoCorrectFields(steps); - steps = normalizeCamelCase(steps); - steps = normalizeEvaluatorFields(steps); - steps = convertShorthandRefs(steps); - steps = injectDefaults(steps); - steps = stripUnrecognizedFields(steps); - return { ...test, steps }; - }), - }; - return normalized; -} - -// --- Individual normalization passes --- - -/** - * Convert MCP shorthand format to raw Eval API format. - * MCP uses type="evaluator" + evaluator_type, raw API uses type="evaluator.xxx". - * Also maps `field` to `actual` with proper JSONPath and auto-generates missing `id` fields. - */ -export function normalizeMcpShorthand(steps: EvalStep[]): EvalStep[] { - let evalCounter = 0; - - return steps.map((step) => { - const evaluator_type = step.evaluator_type as string | undefined; - - // Only applies to MCP shorthand: type="evaluator" with evaluator_type field - if (step.type !== 'evaluator' || !evaluator_type) return step; - - const normalized = { ...step }; - - // Merge type: "evaluator" + evaluator_type: "xxx" → type: "evaluator.xxx" - normalized.type = `evaluator.${evaluator_type}`; - delete normalized.evaluator_type; - - // Convert `field` to `actual` with proper shorthand ref format - if ('field' in normalized) { - if (!('actual' in normalized)) { - const fieldValue = normalized.field as string; - - // Parse "gs1.planner_state.topic" → stepId="gs1", fieldPath="planner_state.topic" - const dotIdx = fieldValue.indexOf('.'); - if (dotIdx > 0) { - const stepId = fieldValue.substring(0, dotIdx); - const fieldPath = fieldValue.substring(dotIdx + 1); - const mappedPath = MCP_FIELD_MAP[fieldPath] ?? fieldPath; - normalized.actual = `{${stepId}.${mappedPath}}`; - } else { - normalized.actual = fieldValue; - } - } - delete normalized.field; - } - - // Auto-generate id if missing - if (!normalized.id || normalized.id === '') { - normalized.id = `eval_${evalCounter}`; - } - evalCounter++; - - return normalized as EvalStep; - }); -} - -/** - * Auto-correct common field name mistakes. - * Maps wrong field names to correct ones (agentId->agent_id, text->utterance, etc.) - */ -export function autoCorrectFields(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - const corrected = { ...step }; - const stepType = corrected.type ?? ''; - - if (stepType.startsWith('agent.')) { - for (const [wrong, correct] of Object.entries(AGENT_CORRECTIONS)) { - if (wrong in corrected && !(correct in corrected)) { - corrected[correct] = corrected[wrong]; - delete corrected[wrong]; - } - } - } else if (stepType.startsWith('evaluator.')) { - for (const [wrong, correct] of Object.entries(EVALUATOR_CORRECTIONS)) { - if (wrong in corrected && !(correct in corrected)) { - corrected[correct] = corrected[wrong]; - delete corrected[wrong]; - } - } - } - - return corrected as EvalStep; - }); -} - -/** - * Normalize camelCase agent field names to snake_case. - * useAgentApi->use_agent_api, plannerDefinitionId->planner_id, etc. 
- */ -export function normalizeCamelCase(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - if (step.type !== 'agent.create_session') return step; - - const normalized = { ...step }; - for (const [alias, canonical] of Object.entries(AGENT_FIELD_ALIASES)) { - if (alias in normalized) { - if (!(canonical in normalized)) { - normalized[canonical] = normalized[alias]; - } - delete normalized[alias]; - } - } - return normalized as EvalStep; - }); -} - -/** - * Apply field aliases: remap alias keys to canonical keys, removing duplicates. - */ -function applyFieldAliases(step: EvalStep, aliases: Record): void { - for (const [alias, canonical] of Object.entries(aliases)) { - if (alias in step && !(canonical in step)) { - step[canonical] = step[alias]; - delete step[alias]; - } else if (alias in step && canonical in step) { - delete step[alias]; - } - } -} - -/** - * Normalize a scoring evaluator step (field aliases + metric_name injection). - */ -function normalizeScoringEvaluator(normalized: EvalStep, evalType: string): void { - applyFieldAliases(normalized, SCORING_FIELD_ALIASES); - - // Auto-inject or correct metric_name - if (!('metric_name' in normalized)) { - const defaultMetric = DEFAULT_METRIC_NAMES[evalType]; - if (defaultMetric) { - normalized.metric_name = defaultMetric; - } - } else if (normalized.metric_name === evalType.split('.')[1]) { - const defaultMetric = DEFAULT_METRIC_NAMES[evalType]; - if (defaultMetric) { - normalized.metric_name = defaultMetric; - } - } -} - -/** - * Normalize an assertion evaluator step (field aliases + operator lowercase + metric_name). - */ -function normalizeAssertionEvaluator(normalized: EvalStep, evalType: string): void { - applyFieldAliases(normalized, ASSERTION_FIELD_ALIASES); - - // Auto-lowercase operator - if ('operator' in normalized && typeof normalized.operator === 'string') { - normalized.operator = normalized.operator.toLowerCase(); - } - - // Auto-inject metric_name for assertion evaluators - if (!('metric_name' in normalized)) { - normalized.metric_name = evalType.split('.')[1]; - } -} - -/** - * Normalize evaluator field names based on evaluator category. - * Maps actual/expected <-> generated_output/reference_answer. - * Also auto-lowercases operator values and auto-injects metric_name. - */ -export function normalizeEvaluatorFields(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - const evalType = step.type ?? ''; - if (!evalType.startsWith('evaluator.')) return step; - - const normalized = { ...step }; - - if (SCORING_EVALUATORS.has(evalType)) { - normalizeScoringEvaluator(normalized, evalType); - } else if (ASSERTION_EVALUATORS.has(evalType)) { - normalizeAssertionEvaluator(normalized, evalType); - } - // Don't inject metric_name for unknown evaluator types to avoid API validation errors - // Unknown evaluators like bot_response_rating and planner_topic_assertion don't use metric_name - - return normalized as EvalStep; - }); -} - -/** - * Convert {step_id.field} shorthand references to JSONPath $.outputs[N].field. - * Builds step_id->index mapping from non-evaluator steps. - */ -export function convertShorthandRefs(steps: EvalStep[]): EvalStep[] { - // Build step_id -> output-array index mapping - const stepIdToIdx: Record = {}; - let outputIdx = 0; - for (const step of steps) { - const sid = step.id; - const stype = step.type ?? 
''; - if (sid && !stype.startsWith('evaluator.')) { - stepIdToIdx[sid] = outputIdx; - outputIdx += 1; - } - } - - const refPattern = /\{([^}]+)\}/g; - - function replaceValue(value: unknown): unknown { - if (typeof value !== 'string') return value; - - return value.replace(refPattern, (match, ref: string) => { - const dotIdx = ref.indexOf('.'); - if (dotIdx < 0) return match; - - const sid = ref.substring(0, dotIdx); - let field = ref.substring(dotIdx + 1); - - if (!(sid in stepIdToIdx)) return match; - - const idx = stepIdToIdx[sid]; - - // Normalize legacy nested-response path to flat response - if (field.startsWith('response.messages')) { - field = 'response'; - } - - return `$.outputs[${idx}].${field}`; - }); - } - - return steps.map((step) => { - const newStep: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (typeof val === 'string') { - newStep[key] = replaceValue(val); - } else if (val !== null && typeof val === 'object' && !Array.isArray(val)) { - const newObj: Record<string, unknown> = {}; - for (const [k, v] of Object.entries(val as Record<string, unknown>)) { - newObj[k] = typeof v === 'string' ? replaceValue(v) : v; - } - newStep[key] = newObj; - } else if (Array.isArray(val)) { - newStep[key] = (val as unknown[]).map((item: unknown) => - typeof item === 'string' ? replaceValue(item) : item - ); - } else { - newStep[key] = val; - } - } - return newStep as EvalStep; - }); -} - -/** - * Inject default values: - * - use_agent_api=true on agent.create_session if neither use_agent_api nor planner_id present - */ -export function injectDefaults(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - if (step.type === 'agent.create_session') { - if (!('use_agent_api' in step) && !('planner_id' in step)) { - return { ...step, use_agent_api: true }; - } - } - return step; - }); -} - -/** - * Strip unrecognized fields from steps based on type-specific whitelists. - */ -export function stripUnrecognizedFields(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - const stepType = step.type ?? ''; - - // Agent steps - if (stepType in VALID_AGENT_FIELDS) { - const validFields = VALID_AGENT_FIELDS[stepType]; - const stripped: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (validFields.has(key)) { - stripped[key] = val; - } - } - return stripped as EvalStep; - } - - // Scoring evaluators - if (SCORING_EVALUATORS.has(stepType)) { - const stripped: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (SCORING_VALID_FIELDS.has(key)) { - stripped[key] = val; - } - } - return stripped as EvalStep; - } - - // Assertion evaluators - if (ASSERTION_EVALUATORS.has(stepType)) { - const stripped: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (ASSERTION_VALID_FIELDS.has(key)) { - stripped[key] = val; - } - } - return stripped as EvalStep; - } - - // Unknown types: don't strip (to avoid breaking future evaluator types) - return step; - }); -} - -// --- Batch splitting --- - -/** - * Split tests array into chunks of batchSize. - */ -export function splitIntoBatches(tests: EvalTest[], batchSize: number): EvalTest[][] { - const batches: EvalTest[][] = []; - for (let i = 0; i < tests.length; i += batchSize) { - batches.push(tests.slice(i, i + batchSize)); - } - return batches; -} diff --git a/src/yamlSpecTranslator.ts b/src/yamlSpecTranslator.ts deleted file mode 100644 index b19015f2..00000000 --- a/src/yamlSpecTranslator.ts +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright 2026, Salesforce, Inc.
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* eslint-disable camelcase */ - -import { parse as parseYaml } from 'yaml'; -import type { TestSpec, TestCase } from '@salesforce/agents'; -import type { EvalPayload, EvalTest, EvalStep } from './evalNormalizer.js'; - -// --- JSONPath mappings from org model to Eval API refs --- - -const ACTUAL_PATH_MAP: Record = { - '$.generatedData.outcome': '{sm.response}', - '$.generatedData.topic': '{gs.response.planner_response.lastExecution.topic}', - '$.generatedData.invokedActions': '{gs.response.planner_response.lastExecution.invokedActions}', - '$.generatedData.actionsSequence': '{gs.response.planner_response.lastExecution.invokedActions}', -}; - -// --- Custom evaluation name to evaluator type mapping --- - -const CUSTOM_EVAL_TYPE_MAP: Record = { - string_comparison: 'evaluator.string_assertion', - numeric_comparison: 'evaluator.numeric_assertion', -}; - -// JSONPaths that require the get_state step -const PLANNER_PATHS = new Set([ - '$.generatedData.topic', - '$.generatedData.invokedActions', - '$.generatedData.actionsSequence', -]); - -// --- Public API --- - -/** - * Returns true if the content looks like a YAML TestSpec (has testCases + subjectName). - * Returns false for JSON EvalPayload, invalid content, or YAML missing required fields. - */ -export function isYamlTestSpec(content: string): boolean { - try { - const parsed: unknown = parseYaml(content); - if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) { - return false; - } - const obj = parsed as Record; - return Array.isArray(obj.testCases) && typeof obj.subjectName === 'string'; - } catch { - return false; - } -} - -/** - * Parse a YAML string into a TestSpec. - * Throws if the content is not valid YAML or is missing required fields. - */ -export function parseTestSpec(content: string): TestSpec { - const parsed: unknown = parseYaml(content); - if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) { - throw new Error('Invalid TestSpec: expected a YAML object'); - } - const obj = parsed as Record; - if (!Array.isArray(obj.testCases)) { - throw new Error('Invalid TestSpec: missing testCases array'); - } - if (typeof obj.subjectName !== 'string') { - throw new Error('Invalid TestSpec: missing subjectName'); - } - if (typeof obj.name !== 'string') { - throw new Error('Invalid TestSpec: missing name'); - } - return parsed as TestSpec; -} - -/** - * Translate a full TestSpec into an EvalPayload. - */ -export function translateTestSpec(spec: TestSpec): EvalPayload { - return { - tests: spec.testCases.map((tc, idx) => translateTestCase(tc, idx, spec.name)), - }; -} - -/** - * Translate a single TestCase into an EvalTest with ordered steps. - */ -export function translateTestCase(testCase: TestCase, index: number, specName?: string): EvalTest { - const id = specName ? `${specName}_case_${index}` : `test_case_${index}`; - const steps: EvalStep[] = []; - - // 1. 
agent.create_session - const createSessionStep: EvalStep = { - type: 'agent.create_session', - id: 'cs', - use_agent_api: true, - }; - - if (testCase.contextVariables && testCase.contextVariables.length > 0) { - // Validate for duplicate names - const names = testCase.contextVariables.map((cv) => cv.name); - const duplicates = names.filter((name, idx) => names.indexOf(name) !== idx); - if (duplicates.length > 0) { - throw new Error( - `Duplicate contextVariable names found in test case ${index}: ${[...new Set(duplicates)].join( - ', ' - )}. Each contextVariable name must be unique.` - ); - } - - createSessionStep.context_variables = Object.fromEntries( - testCase.contextVariables.map((cv) => [cv.name, cv.value]) - ); - } - - steps.push(createSessionStep); - - // 2. Conversation history — only user messages become send_message steps - let historyIdx = 0; - if (testCase.conversationHistory) { - for (const entry of testCase.conversationHistory) { - if (entry.role === 'user') { - steps.push({ - type: 'agent.send_message', - id: `history_${historyIdx}`, - session_id: '{cs.session_id}', - utterance: entry.message, - }); - historyIdx++; - } - } - } - - // 3. Test utterance - steps.push({ - type: 'agent.send_message', - id: 'sm', - session_id: '{cs.session_id}', - utterance: testCase.utterance, - }); - - // 4. Determine if get_state is needed - const needsGetState = needsPlannerState(testCase); - if (needsGetState) { - steps.push({ - type: 'agent.get_state', - id: 'gs', - session_id: '{cs.session_id}', - }); - } - - // 5. Evaluators - if (testCase.expectedTopic !== undefined) { - steps.push({ - type: 'evaluator.planner_topic_assertion', - id: 'check_topic', - expected: testCase.expectedTopic, - actual: '{gs.response.planner_response.lastExecution.topic}', - operator: 'contains', - }); - } - - if (testCase.expectedActions !== undefined && testCase.expectedActions.length > 0) { - steps.push({ - type: 'evaluator.planner_actions_assertion', - id: 'check_actions', - expected: testCase.expectedActions, - actual: '{gs.response.planner_response.lastExecution.invokedActions}', - operator: 'includes_items', - }); - } - - if (testCase.expectedOutcome !== undefined) { - steps.push({ - type: 'evaluator.bot_response_rating', - id: 'check_outcome', - utterance: testCase.utterance, - expected: testCase.expectedOutcome, - actual: '{sm.response}', - threshold: 3.0, - }); - } - - if (testCase.customEvaluations) { - testCase.customEvaluations.forEach((customEval, customIdx) => { - const step = translateCustomEvaluation(customEval, customIdx); - steps.push(step); - }); - } - - return { id, steps }; -} - -// --- Internal helpers --- - -/** - * Determine whether the get_state step is needed for this test case. - */ -function needsPlannerState(testCase: TestCase): boolean { - if (testCase.expectedTopic !== undefined) return true; - if (testCase.expectedActions !== undefined && testCase.expectedActions.length > 0) return true; - - if (testCase.customEvaluations) { - for (const customEval of testCase.customEvaluations) { - for (const param of customEval.parameters) { - if (param.name === 'actual' && PLANNER_PATHS.has(param.value)) { - return true; - } - } - } - } - - return false; -} - -/** - * Translate a single customEvaluation entry into an EvalStep. - */ -function translateCustomEvaluation( - customEval: NonNullable[number], - index: number -): EvalStep { - const evalType = CUSTOM_EVAL_TYPE_MAP[customEval.name] ?? 
`evaluator.${customEval.name}`; - - let operator = ''; - let actual = ''; - let expected = ''; - - for (const param of customEval.parameters) { - if (param.name === 'operator') { - operator = param.value; - } else if (param.name === 'actual') { - actual = mapActualPath(param.value); - } else if (param.name === 'expected') { - expected = param.value; - } - } - - return { - type: evalType, - id: `custom_${index}`, - operator, - actual, - expected, - }; -} - -/** - * Map an org-model JSONPath to the Eval API shorthand ref. - * Unknown paths are returned as-is. - */ -function mapActualPath(path: string): string { - return ACTUAL_PATH_MAP[path] ?? path; -} diff --git a/test/commands/agent/test/run-eval.test.ts b/test/commands/agent/test/run-eval.test.ts new file mode 100644 index 00000000..cccef486 --- /dev/null +++ b/test/commands/agent/test/run-eval.test.ts @@ -0,0 +1,360 @@ +/* + * Copyright 2026, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, + @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, camelcase */ + +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { expect } from 'chai'; +import sinon from 'sinon'; +import esmock from 'esmock'; +import { TestContext, MockTestOrgData } from '@salesforce/core/testSetup'; + +// ─── Shared fixtures ───────────────────────────────────────────────────────── + +const EVAL_PAYLOAD = JSON.stringify({ + tests: [ + { + id: 'test-topic-routing', + steps: [ + { type: 'agent.create_session', id: 'session' }, + { + type: 'agent.send_message', + id: 'msg1', + session_id: '{session.session_id}', + utterance: 'What is the weather?', + }, + { type: 'agent.get_state', id: 'state1', session_id: '{session.session_id}' }, + { + type: 'evaluator.planner_topic_assertion', + id: 'check-topic', + actual: '{state1.response.planner_response.lastExecution.topic}', + expected: 'Weather_and_Temperature_Information', + operator: 'equals', + }, + ], + }, + ], +}); + +const YAML_SPEC = ` +name: Weather_Test +description: Test weather agent +subjectType: AGENT +subjectName: Local_Info_Agent +testCases: + - utterance: 'What is the weather?' 
+ expectedTopic: Weather_and_Temperature_Information + expectedActions: [] + expectedOutcome: 'The agent should provide weather information' +`; + +const MOCK_API_RESULTS = [ + { + id: 'test-topic-routing', + evaluation_results: [{ id: 'check-topic', is_pass: true }], + errors: [], + outputs: [], + }, +]; + +// ─── Test suite ────────────────────────────────────────────────────────────── + +describe('agent test run-eval command', () => { + const $$ = new TestContext(); + let testOrg: MockTestOrgData; + let tmpDir: string; + + // Stubs for @salesforce/agents exports + let isYamlTestSpecStub: sinon.SinonStub; + let parseTestSpecStub: sinon.SinonStub; + let translateTestSpecStub: sinon.SinonStub; + let normalizePayloadStub: sinon.SinonStub; + let splitIntoBatchesStub: sinon.SinonStub; + let resolveAgentStub: sinon.SinonStub; + let executeBatchesStub: sinon.SinonStub; + let buildResultSummaryStub: sinon.SinonStub; + let formatResultsStub: sinon.SinonStub; + + let AgentTestRunEval: any; + + beforeEach(async () => { + testOrg = new MockTestOrgData(); + await $$.stubAuths(testOrg); + + tmpDir = mkdtempSync(join(tmpdir(), 'run-eval-test-')); + + // Default stub implementations + isYamlTestSpecStub = sinon.stub().returns(false); + parseTestSpecStub = sinon.stub().returns({ + name: 'Weather_Test', + subjectName: 'Local_Info_Agent', + testCases: [{ utterance: 'What is the weather?' }], + }); + translateTestSpecStub = sinon.stub().returns(JSON.parse(EVAL_PAYLOAD)); + normalizePayloadStub = sinon.stub().callsFake((p: unknown) => p); + splitIntoBatchesStub = sinon.stub().callsFake((tests: unknown[]) => [tests]); + resolveAgentStub = sinon.stub().resolves({ agentId: 'bot-001', versionId: 'ver-001' }); + executeBatchesStub = sinon.stub().resolves(MOCK_API_RESULTS); + buildResultSummaryStub = sinon.stub().returns({ + summary: { passed: 1, failed: 0, scored: 0, errors: 0 }, + testSummaries: [{ id: 'test-topic-routing', status: 'passed', evaluations: [], outputs: [] }], + }); + formatResultsStub = sinon.stub().returns('# Agent Evaluation Results'); + + const mod = await esmock('../../../../src/commands/agent/test/run-eval.js', { + '@salesforce/agents': { + isYamlTestSpec: isYamlTestSpecStub, + parseTestSpec: parseTestSpecStub, + translateTestSpec: translateTestSpecStub, + normalizePayload: normalizePayloadStub, + splitIntoBatches: splitIntoBatchesStub, + resolveAgent: resolveAgentStub, + executeBatches: executeBatchesStub, + buildResultSummary: buildResultSummaryStub, + formatResults: formatResultsStub, + }, + }); + + AgentTestRunEval = mod.default; + }); + + afterEach(() => { + sinon.restore(); + $$.restore(); + rmSync(tmpDir, { recursive: true, force: true }); + }); + + // ─── State ───────────────────────────────────────────────────────────────── + + describe('command metadata', () => { + it('is marked as GA state', () => { + expect(AgentTestRunEval.state).to.equal('ga'); + }); + + it('is not hidden', () => { + expect(AgentTestRunEval.hidden).to.not.equal(true); + }); + }); + + // ─── JSON payload path ───────────────────────────────────────────────────── + + describe('JSON payload', () => { + it('runs with an inline JSON string', async () => { + const result = await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(result.summary.passed).to.equal(1); + expect(result.tests).to.have.length(1); + expect(result.tests[0].status).to.equal('passed'); + }); + + it('reads the spec from a file when the string is not valid JSON', async () => { + const specFile = 
join(tmpDir, 'payload.json'); + writeFileSync(specFile, EVAL_PAYLOAD, 'utf-8'); + + const result = await AgentTestRunEval.run(['--spec', specFile, '--target-org', testOrg.username]); + + expect(result.summary.passed).to.equal(1); + }); + + it('throws SpecFileNotFound (exit 2) when the file does not exist', async () => { + try { + await AgentTestRunEval.run(['--spec', '/nonexistent/path.json', '--target-org', testOrg.username]); + expect.fail('should have thrown'); + } catch (err: any) { + expect(err.exitCode).to.equal(2); + expect(err.name).to.equal('SpecFileNotFound'); + } + }); + + it('calls normalizePayload by default', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(normalizePayloadStub.calledOnce).to.be.true; + }); + + it('skips normalizePayload when --no-normalize is set', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--no-normalize', '--target-org', testOrg.username]); + + expect(normalizePayloadStub.called).to.be.false; + }); + + it('resolves agent IDs when --api-name is provided', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--api-name', 'My_Agent', '--target-org', testOrg.username]); + + expect(resolveAgentStub.calledOnceWith(sinon.match.any, 'My_Agent')).to.be.true; + }); + + it('throws AgentNotFound (exit 2) when resolveAgent fails', async () => { + resolveAgentStub.rejects(new Error('not found')); + + try { + await AgentTestRunEval.run([ + '--spec', + EVAL_PAYLOAD, + '--api-name', + 'Missing_Agent', + '--target-org', + testOrg.username, + ]); + expect.fail('should have thrown'); + } catch (err: any) { + expect(err.exitCode).to.equal(2); + expect(err.name).to.equal('AgentNotFound'); + } + }); + + it('throws TestExecutionFailed (exit 4) when executeBatches fails', async () => { + executeBatchesStub.rejects(new Error('API down')); + + try { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + expect.fail('should have thrown'); + } catch (err: any) { + expect(err.exitCode).to.equal(4); + expect(err.name).to.equal('TestExecutionFailed'); + } + }); + }); + + // ─── YAML spec path ──────────────────────────────────────────────────────── + + describe('YAML spec', () => { + beforeEach(() => { + isYamlTestSpecStub.returns(true); + }); + + it('runs with an inline YAML string', async () => { + const result = await AgentTestRunEval.run(['--spec', YAML_SPEC, '--target-org', testOrg.username]); + + expect(parseTestSpecStub.calledOnce).to.be.true; + expect(translateTestSpecStub.calledOnce).to.be.true; + expect(result.summary.passed).to.equal(1); + }); + + it('auto-infers api-name from subjectName when --api-name is omitted', async () => { + await AgentTestRunEval.run(['--spec', YAML_SPEC, '--target-org', testOrg.username]); + + // resolveAgent should be called with the subjectName from the parsed spec + expect(resolveAgentStub.calledOnceWith(sinon.match.any, 'Local_Info_Agent')).to.be.true; + }); + + it('prefers explicit --api-name over auto-inferred subjectName', async () => { + await AgentTestRunEval.run([ + '--spec', + YAML_SPEC, + '--api-name', + 'Override_Agent', + '--target-org', + testOrg.username, + ]); + + expect(resolveAgentStub.calledOnceWith(sinon.match.any, 'Override_Agent')).to.be.true; + }); + + it('reads YAML spec from a file', async () => { + const specFile = join(tmpDir, 'spec.yaml'); + writeFileSync(specFile, YAML_SPEC, 'utf-8'); + + // isYamlTestSpec returns false for the file path string, true for the file content + 
isYamlTestSpecStub.onFirstCall().returns(false).onSecondCall().returns(true); + + const result = await AgentTestRunEval.run(['--spec', specFile, '--target-org', testOrg.username]); + + expect(result.summary.passed).to.equal(1); + }); + }); + + // ─── Batch size ──────────────────────────────────────────────────────────── + + describe('batch size', () => { + it('clamps --batch-size to maximum of 5', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--batch-size', '99', '--target-org', testOrg.username]); + + const batchSize = splitIntoBatchesStub.firstCall.args[1] as number; + expect(batchSize).to.equal(5); + }); + + it('clamps --batch-size to minimum of 1', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--batch-size', '0', '--target-org', testOrg.username]); + + const batchSize = splitIntoBatchesStub.firstCall.args[1] as number; + expect(batchSize).to.equal(1); + }); + + it('passes through a valid --batch-size unchanged', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--batch-size', '3', '--target-org', testOrg.username]); + + const batchSize = splitIntoBatchesStub.firstCall.args[1] as number; + expect(batchSize).to.equal(3); + }); + }); + + // ─── Result format ───────────────────────────────────────────────────────── + + describe('result format', () => { + it('defaults to human format', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(formatResultsStub.calledOnceWith(sinon.match.any, 'human')).to.be.true; + }); + + it('passes --result-format tap to formatResults', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--result-format', 'tap', '--target-org', testOrg.username]); + + expect(formatResultsStub.calledOnceWith(sinon.match.any, 'tap')).to.be.true; + }); + + it('passes --result-format junit to formatResults', async () => { + await AgentTestRunEval.run([ + '--spec', + EVAL_PAYLOAD, + '--result-format', + 'junit', + '--target-org', + testOrg.username, + ]); + + expect(formatResultsStub.calledOnceWith(sinon.match.any, 'junit')).to.be.true; + }); + }); + + // ─── Exit code behaviour ─────────────────────────────────────────────────── + + describe('exit code', () => { + it('sets process.exitCode to 1 when summary contains errors', async () => { + buildResultSummaryStub.returns({ + summary: { passed: 0, failed: 0, scored: 0, errors: 2 }, + testSummaries: [{ id: 'test-1', status: 'failed', evaluations: [], outputs: [] }], + }); + + const originalExitCode = process.exitCode; + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(process.exitCode).to.equal(1); + process.exitCode = originalExitCode; + }); + + it('does not set process.exitCode when there are no errors', async () => { + process.exitCode = undefined; + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(process.exitCode).to.be.undefined; + }); + }); +}); diff --git a/test/evalFormatter.test.ts b/test/evalFormatter.test.ts index b4b85a58..822b93eb 100644 --- a/test/evalFormatter.test.ts +++ b/test/evalFormatter.test.ts @@ -17,7 +17,7 @@ /* eslint-disable camelcase */ import { expect } from 'chai'; -import { formatResults, type EvalApiResponse } from '../src/evalFormatter.js'; +import { formatResults, type EvalApiResponse } from '@salesforce/agents'; const MOCK_RESPONSE: EvalApiResponse = { results: [ diff --git a/test/evalNormalizer.test.ts b/test/evalNormalizer.test.ts index 3ddab444..c4857d36 
100644 --- a/test/evalNormalizer.test.ts +++ b/test/evalNormalizer.test.ts @@ -29,7 +29,7 @@ import { splitIntoBatches, type EvalStep, type EvalPayload, -} from '../src/evalNormalizer.js'; +} from '@salesforce/agents'; describe('evalNormalizer', () => { describe('normalizeMcpShorthand', () => { diff --git a/test/yamlSpecTranslator.test.ts b/test/yamlSpecTranslator.test.ts index 40914c67..1341fd77 100644 --- a/test/yamlSpecTranslator.test.ts +++ b/test/yamlSpecTranslator.test.ts @@ -18,7 +18,7 @@ import { expect } from 'chai'; import type { TestCase } from '@salesforce/agents'; -import { isYamlTestSpec, parseTestSpec, translateTestCase, translateTestSpec } from '../src/yamlSpecTranslator.js'; +import { isYamlTestSpec, parseTestSpec, translateTestCase, translateTestSpec } from '@salesforce/agents'; describe('yamlSpecTranslator', () => { describe('isYamlTestSpec', () => { From f48d2b0d064cdc2c778e97be66f4edf7cf78738d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Rivas?= Date: Tue, 5 May 2026 17:13:08 -0300 Subject: [PATCH 2/3] fix(agent/test/run-eval): address pre-PR review findings - Remove state='ga' (GA commands omit state to avoid oclif help regression) - Remove unused --wait flag - Wrap parseTestSpec/translateTestSpec in try/catch for named SfError - Validate test.steps is an array before injection loop - Hoist isYamlTestSpec to avoid double-call - Extract resolveAndInjectAgent helper to stay within complexity limit - Fix test stubs to use $$.SANDBOX, remove sinon.restore(), fix describe label Co-Authored-By: Claude Sonnet 4.6 --- messages/agent.test.run-eval.md | 4 - src/commands/agent/test/run-eval.ts | 119 +++++++++++----------- test/commands/agent/test/run-eval.test.ts | 31 +++--- 3 files changed, 74 insertions(+), 80 deletions(-) diff --git a/messages/agent.test.run-eval.md b/messages/agent.test.run-eval.md index 304ab4bd..2350c84b 100644 --- a/messages/agent.test.run-eval.md +++ b/messages/agent.test.run-eval.md @@ -20,10 +20,6 @@ Path to test spec file (YAML or JSON). Supports reading from stdin when piping c Agent DeveloperName (also called API name) to resolve agent_id and agent_version_id. Auto-inferred from the YAML spec's subjectName. -# flags.wait.summary - -Number of minutes to wait for results. - # flags.result-format.summary Format of the agent test results. 
diff --git a/src/commands/agent/test/run-eval.ts b/src/commands/agent/test/run-eval.ts index d0187566..c31b3815 100644 --- a/src/commands/agent/test/run-eval.ts +++ b/src/commands/agent/test/run-eval.ts @@ -41,11 +41,35 @@ export type RunEvalResult = { summary: { passed: number; failed: number; scored: number; errors: number }; }; +async function resolveAndInjectAgent( + org: Parameters<typeof resolveAgent>[0], + agentApiName: string, + payload: EvalPayload +): Promise<void> { + let agentId: string; + let versionId: string; + try { + ({ agentId, versionId } = await resolveAgent(org, agentApiName)); + } catch (e) { + const wrapped = SfError.wrap(e); + throw new SfError(`Agent '${agentApiName}' not found.`, 'AgentNotFound', [], 2, wrapped); + } + for (const test of payload.tests) { + for (const step of test.steps) { + if (step.type === 'agent.create_session') { + // eslint-disable-next-line camelcase + step.agent_id = agentId; + // eslint-disable-next-line camelcase + step.agent_version_id = versionId; + } + } + } +} + export default class AgentTestRunEval extends SfCommand { public static readonly summary = messages.getMessage('summary'); public static readonly description = messages.getMessage('description'); public static readonly examples = messages.getMessages('examples'); - public static state = 'ga'; public static readonly envVariablesSection = toHelpSection( 'ENVIRONMENT VARIABLES', @@ -72,11 +96,6 @@ export default class AgentTestRunEval extends SfCommand { char: 'n', summary: messages.getMessage('flags.api-name.summary'), }), - wait: Flags.integer({ - char: 'w', - default: 10, - summary: messages.getMessage('flags.wait.summary'), - }), 'result-format': resultFormatFlag(), 'batch-size': Flags.integer({ default: 5, @@ -94,32 +113,37 @@ export default class AgentTestRunEval extends SfCommand { // 1. Get spec content (from file or stdin via allowStdin) let rawContent = flags.spec; + let isYaml = isYamlTestSpec(rawContent); - // If spec looks like it might be a file path (not parseable content), read the file - try { - if (!isYamlTestSpec(rawContent)) { - JSON.parse(rawContent); - } - } catch { + if (!isYaml) { try { - rawContent = await readFile(flags.spec, 'utf-8'); - } catch (e) { - const wrapped = SfError.wrap(e); - throw new SfError(`Spec file not found: ${flags.spec}`, 'SpecFileNotFound', [], 2, wrapped); + JSON.parse(rawContent); + } catch { + try { + rawContent = await readFile(flags.spec, 'utf-8'); + } catch (e) { + const wrapped = SfError.wrap(e); + throw new SfError(`Spec file not found: ${flags.spec}`, 'SpecFileNotFound', [], 2, wrapped); + } + isYaml = isYamlTestSpec(rawContent); + } } // 2.
Detect format and parse - let payload: EvalPayload; + let payload!: EvalPayload; let agentApiName = flags['api-name']; - if (isYamlTestSpec(rawContent)) { - const spec = parseTestSpec(rawContent); - payload = translateTestSpec(spec); + if (isYaml) { + try { + const spec = parseTestSpec(rawContent); + payload = translateTestSpec(spec); - if (!agentApiName) { - agentApiName = spec.subjectName; - this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()])); + if (!agentApiName) { + agentApiName = spec.subjectName; + this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()])); + } + } catch (e) { + throw messages.createError('error.invalidPayload', [(e as Error).message]); } } else { try { @@ -133,29 +157,15 @@ export default class AgentTestRunEval extends SfCommand { throw messages.createError('error.invalidPayload', ['missing or empty "tests" array']); } - // 3. If --api-name (or auto-inferred from YAML), resolve IDs and inject - if (agentApiName) { - let agentId; - let versionId; - try { - const resolved = await resolveAgent(org, agentApiName); - agentId = resolved.agentId; - versionId = resolved.versionId; - } catch (e) { - const wrapped = SfError.wrap(e); - throw new SfError(`Agent '${agentApiName}' not found.`, 'AgentNotFound', [], 2, wrapped); + for (const test of payload.tests) { + if (!Array.isArray(test.steps)) { + throw messages.createError('error.invalidPayload', [`test '${test.id}' has missing or invalid 'steps' array`]); } + } - for (const test of payload.tests) { - for (const step of test.steps) { - if (step.type === 'agent.create_session') { - // eslint-disable-next-line camelcase - step.agent_id = agentId; - // eslint-disable-next-line camelcase - step.agent_version_id = versionId; - } - } - } + // 3. If --api-name (or auto-inferred from YAML), resolve IDs and inject + if (agentApiName) { + await resolveAndInjectAgent(org, agentApiName, payload); } // 4. Normalize payload unless --no-normalize @@ -163,17 +173,12 @@ export default class AgentTestRunEval extends SfCommand { payload = normalizePayload(payload); } - // 5. Clamp batch size + // 5. Clamp batch size and split into batches const batchSize = Math.min(Math.max(flags['batch-size'], 1), 5); - - // 6. Split into batches const batches = splitIntoBatches(payload.tests, batchSize); - // 7. Execute batches - let allResults; - try { - allResults = await executeBatches(org, batches, (msg) => this.log(msg)); - } catch (e) { + // 6. Execute batches + const allResults = await executeBatches(org, batches, (msg) => this.log(msg)).catch((e) => { const wrapped = SfError.wrap(e); throw new SfError( `Failed to execute tests: ${wrapped.message}`, @@ -182,16 +187,14 @@ export default class AgentTestRunEval extends SfCommand { 4, wrapped ); - } + }); const mergedResponse: EvalApiResponse = { results: allResults as EvalApiResponse['results'] }; - // 8. Format output - const resultFormat = (flags['result-format'] ?? 'human') as ResultFormat; - const formatted = formatResults(mergedResponse, resultFormat); - this.log(formatted); + // 7. Format output + this.log(formatResults(mergedResponse, (flags['result-format'] ?? 'human') as ResultFormat)); - // 9. Build structured result for --json + // 8. 
Build structured result for --json const { summary, testSummaries } = buildResultSummary(mergedResponse); if (summary.errors > 0) { diff --git a/test/commands/agent/test/run-eval.test.ts b/test/commands/agent/test/run-eval.test.ts index cccef486..8be229b9 100644 --- a/test/commands/agent/test/run-eval.test.ts +++ b/test/commands/agent/test/run-eval.test.ts @@ -14,8 +14,7 @@ * limitations under the License. */ -/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, - @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, camelcase */ +/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, camelcase */ import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; import { tmpdir } from 'node:os'; @@ -75,7 +74,7 @@ const MOCK_API_RESULTS = [ // ─── Test suite ────────────────────────────────────────────────────────────── -describe('agent test run-eval command', () => { +describe('agent test run-eval', () => { const $$ = new TestContext(); let testOrg: MockTestOrgData; let tmpDir: string; @@ -100,22 +99,22 @@ describe('agent test run-eval command', () => { tmpDir = mkdtempSync(join(tmpdir(), 'run-eval-test-')); // Default stub implementations - isYamlTestSpecStub = sinon.stub().returns(false); - parseTestSpecStub = sinon.stub().returns({ + isYamlTestSpecStub = $$.SANDBOX.stub().returns(false); + parseTestSpecStub = $$.SANDBOX.stub().returns({ name: 'Weather_Test', subjectName: 'Local_Info_Agent', testCases: [{ utterance: 'What is the weather?' }], }); - translateTestSpecStub = sinon.stub().returns(JSON.parse(EVAL_PAYLOAD)); - normalizePayloadStub = sinon.stub().callsFake((p: unknown) => p); - splitIntoBatchesStub = sinon.stub().callsFake((tests: unknown[]) => [tests]); - resolveAgentStub = sinon.stub().resolves({ agentId: 'bot-001', versionId: 'ver-001' }); - executeBatchesStub = sinon.stub().resolves(MOCK_API_RESULTS); - buildResultSummaryStub = sinon.stub().returns({ + translateTestSpecStub = $$.SANDBOX.stub().returns(JSON.parse(EVAL_PAYLOAD)); + normalizePayloadStub = $$.SANDBOX.stub().callsFake((p: unknown) => p); + splitIntoBatchesStub = $$.SANDBOX.stub().callsFake((tests: unknown[]) => [tests]); + resolveAgentStub = $$.SANDBOX.stub().resolves({ agentId: 'bot-001', versionId: 'ver-001' }); + executeBatchesStub = $$.SANDBOX.stub().resolves(MOCK_API_RESULTS); + buildResultSummaryStub = $$.SANDBOX.stub().returns({ summary: { passed: 1, failed: 0, scored: 0, errors: 0 }, testSummaries: [{ id: 'test-topic-routing', status: 'passed', evaluations: [], outputs: [] }], }); - formatResultsStub = sinon.stub().returns('# Agent Evaluation Results'); + formatResultsStub = $$.SANDBOX.stub().returns('# Agent Evaluation Results'); const mod = await esmock('../../../../src/commands/agent/test/run-eval.js', { '@salesforce/agents': { @@ -135,7 +134,6 @@ describe('agent test run-eval command', () => { }); afterEach(() => { - sinon.restore(); $$.restore(); rmSync(tmpDir, { recursive: true, force: true }); }); @@ -143,11 +141,8 @@ describe('agent test run-eval command', () => { // ─── State ───────────────────────────────────────────────────────────────── describe('command metadata', () => { - it('is marked as GA state', () => { - expect(AgentTestRunEval.state).to.equal('ga'); - }); - - it('is not hidden', () => { + it('is not in beta or hidden state', () => { + expect(AgentTestRunEval.state).to.not.equal('beta'); 
       expect(AgentTestRunEval.hidden).to.not.equal(true);
     });
   });

From 7f9c3533bbb497ff1847c048a47fb8641f0b2519 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Rivas?=
Date: Tue, 5 May 2026 17:14:08 -0300
Subject: [PATCH 3/3] fix(agent/test/run-eval): update command snapshot to reflect --wait removal

Co-Authored-By: Claude Sonnet 4.6
---
 command-snapshot.json | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/command-snapshot.json b/command-snapshot.json
index f48911bf..9f84671b 100644
--- a/command-snapshot.json
+++ b/command-snapshot.json
@@ -228,7 +228,7 @@
     "alias": [],
     "command": "agent:test:run-eval",
     "flagAliases": [],
-    "flagChars": ["n", "o", "s", "w"],
+    "flagChars": ["n", "o", "s"],
     "flags": [
       "api-name",
       "api-version",
@@ -238,8 +238,7 @@
       "no-normalize",
       "result-format",
       "spec",
-      "target-org",
-      "wait"
+      "target-org"
     ],
     "plugin": "@salesforce/plugin-agent"
   },
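
Note (not part of the patch series): for reviewers who want the end-to-end flow without re-reading the hunks, below is a condensed TypeScript sketch of how a caller can drive the same @salesforce/agents surface the command now delegates to. The runEvalSketch helper, the Org wiring, and the exact export typings are assumptions made for illustration; only the function names and call shapes come from the diffs above.

import { Org } from '@salesforce/core';
import {
  type EvalPayload,
  type EvalApiResponse,
  type ResultFormat,
  normalizePayload,
  splitIntoBatches,
  resolveAgent,
  executeBatches,
  buildResultSummary,
  formatResults,
} from '@salesforce/agents';

// Illustrative sketch only; mirrors the command's run() flow under the assumptions above.
export async function runEvalSketch(org: Org, payload: EvalPayload, apiName?: string): Promise<void> {
  // Resolve the agent and stamp its IDs onto every agent.create_session step,
  // the same injection the command performs in resolveAndInjectAgent.
  if (apiName) {
    const { agentId, versionId } = await resolveAgent(org, apiName);
    for (const test of payload.tests) {
      for (const step of test.steps) {
        if (step.type === 'agent.create_session') {
          step.agent_id = agentId;
          step.agent_version_id = versionId;
        }
      }
    }
  }

  // Normalize, split into batches of at most 5 tests, and execute them.
  const normalized = normalizePayload(payload);
  const batches = splitIntoBatches(normalized.tests, 5);
  const results = await executeBatches(org, batches, (msg) => console.log(msg));

  // Merge the batch results, summarize, and print in the default human format.
  const merged: EvalApiResponse = { results: results as EvalApiResponse['results'] };
  const { summary } = buildResultSummary(merged);
  console.log(formatResults(merged, 'human' as ResultFormat));
  console.log(`passed=${summary.passed} failed=${summary.failed} errors=${summary.errors}`);
}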