From e2670cf7449b835c852c4985df2b236e05707fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Rivas?= Date: Tue, 5 May 2026 13:46:30 -0300 Subject: [PATCH 1/3] feat(agent/test/run-eval): promote command to GA and delegate logic to @salesforce/agents W-22203426 Removes local copies of evalNormalizer, evalFormatter, and yamlSpecTranslator now that they live in the shared library. Rewrites run-eval.ts as a thin CLI shell that imports normalizePayload, translateTestSpec, resolveAgent, executeBatches, buildResultSummary, and formatResults from @salesforce/agents. Promotes the command from state='beta'/hidden to state='ga'. Co-Authored-By: Claude Sonnet 4.6 --- src/commands/agent/test/run-eval.ts | 155 +------ src/evalFormatter.ts | 355 --------------- src/evalNormalizer.ts | 500 ---------------------- src/yamlSpecTranslator.ts | 265 ------------ test/commands/agent/test/run-eval.test.ts | 360 ++++++++++++++++ test/evalFormatter.test.ts | 2 +- test/evalNormalizer.test.ts | 2 +- test/yamlSpecTranslator.test.ts | 2 +- 8 files changed, 381 insertions(+), 1260 deletions(-) delete mode 100644 src/evalFormatter.ts delete mode 100644 src/evalNormalizer.ts delete mode 100644 src/yamlSpecTranslator.ts create mode 100644 test/commands/agent/test/run-eval.test.ts diff --git a/src/commands/agent/test/run-eval.ts b/src/commands/agent/test/run-eval.ts index d2569526..d0187566 100644 --- a/src/commands/agent/test/run-eval.ts +++ b/src/commands/agent/test/run-eval.ts @@ -16,11 +16,22 @@ import { readFile } from 'node:fs/promises'; import { Flags, SfCommand, toHelpSection } from '@salesforce/sf-plugins-core'; -import { EnvironmentVariable, Messages, Org, SfError } from '@salesforce/core'; -import { type EvalPayload, normalizePayload, splitIntoBatches } from '../../../evalNormalizer.js'; -import { type EvalApiResponse, formatResults, type ResultFormat } from '../../../evalFormatter.js'; +import { EnvironmentVariable, Messages, SfError } from '@salesforce/core'; +import { + type EvalPayload, + normalizePayload, + splitIntoBatches, + type EvalApiResponse, + formatResults, + type ResultFormat, + isYamlTestSpec, + parseTestSpec, + translateTestSpec, + resolveAgent, + executeBatches, + buildResultSummary, +} from '@salesforce/agents'; import { resultFormatFlag } from '../../../flags.js'; -import { isYamlTestSpec, parseTestSpec, translateTestSpec } from '../../../yamlSpecTranslator.js'; Messages.importMessagesDirectoryFromMetaUrl(import.meta.url); const messages = Messages.loadMessages('@salesforce/plugin-agent', 'agent.test.run-eval'); @@ -30,132 +41,11 @@ export type RunEvalResult = { summary: { passed: number; failed: number; scored: number; errors: number }; }; -// --- Standalone helper functions --- - -type ApiHeaders = { - orgId: string; - userId: string; - instanceUrl: string; -}; - -async function getApiHeaders(org: Org): Promise { - const conn = org.getConnection(); - const userInfo = await conn.request<{ user_id: string }>(`${conn.instanceUrl}/services/oauth2/userinfo`); - - return { - orgId: org.getOrgId(), - userId: userInfo.user_id, - instanceUrl: conn.instanceUrl, - }; -} - -async function callEvalApi(org: Org, payload: EvalPayload, headers: ApiHeaders): Promise<{ results?: unknown[] }> { - const conn = org.getConnection(); - - return conn.request<{ results?: unknown[] }>({ - url: 'https://api.salesforce.com/einstein/evaluation/v1/tests', - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-sfdc-core-tenant-id': `core/prod/${headers.orgId}`, - 'x-org-id': headers.orgId, - 
'x-sfdc-core-instance-url': headers.instanceUrl, - 'x-sfdc-user-id': headers.userId, - 'x-client-feature-id': 'AIPlatformEvaluation', - 'x-sfdc-app-context': 'EinsteinGPT', - }, - body: JSON.stringify(payload), - }); -} - -async function resolveAgent(org: Org, apiName: string): Promise<{ agentId: string; versionId: string }> { - const conn = org.getConnection(); - - // Escape single quotes to prevent SOQL injection - const escapedApiName = apiName.replace(/'/g, "\\'"); - - const botResult = await conn.query<{ Id: string }>( - `SELECT Id FROM BotDefinition WHERE DeveloperName = '${escapedApiName}'` - ); - if (!botResult.records.length) { - throw messages.createError('error.agentNotFound', [apiName]); - } - const agentId = botResult.records[0].Id; - - // Filter to published/active versions only - const versionResult = await conn.query<{ Id: string }>( - `SELECT Id FROM BotVersion WHERE BotDefinitionId = '${agentId}' ORDER BY VersionNumber DESC LIMIT 1` - ); - if (!versionResult.records.length) { - throw messages.createError('error.agentVersionNotFound', [apiName]); - } - const versionId = versionResult.records[0].Id; - - return { agentId, versionId }; -} - -async function executeBatches( - org: Org, - batches: Array<EvalPayload['tests']>, - log: (msg: string) => void -): Promise<unknown[]> { - // Pre-calculate headers once to avoid redundant API calls - const headers = await getApiHeaders(org); - - // Execute all batches in parallel for better performance - if (batches.length > 1) { - log(messages.getMessage('info.batchProgress', [batches.length, batches.length, 'total'])); - } - - const batchPromises = batches.map(async (batch) => { - const batchPayload: EvalPayload = { tests: batch }; - const resultObj = await callEvalApi(org, batchPayload, headers); - return resultObj.results ?? []; - }); - - const batchResults = await Promise.all(batchPromises); - return batchResults.flat(); -} - -function buildResultSummary(mergedResponse: EvalApiResponse): { - summary: RunEvalResult['summary']; - testSummaries: RunEvalResult['tests']; -} { - const summary = { passed: 0, failed: 0, scored: 0, errors: 0 }; - const testSummaries: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }> = []; - - for (const testResult of mergedResponse.results ?? []) { - const tr = testResult as Record<string, unknown>; - const testId = (tr.id as string) ?? 'unknown'; - const evalResults = (tr.evaluation_results as Array<Record<string, unknown>>) ?? []; - const testErrors = (tr.errors as unknown[]) ?? []; - - const passed = evalResults.filter((e) => e.is_pass === true).length; - const failed = evalResults.filter((e) => e.is_pass === false).length; - const scored = evalResults.filter((e) => e.score != null && e.is_pass == null).length; - - summary.passed += passed; - summary.failed += failed; - summary.scored += scored; - summary.errors += testErrors.length; - - testSummaries.push({ - id: testId, - status: failed > 0 || testErrors.length > 0 ? 'failed' : 'passed', - evaluations: evalResults, - outputs: (tr.outputs as unknown[]) ??
[], - }); - } - - return { summary, testSummaries }; -} - export default class AgentTestRunEval extends SfCommand { public static readonly summary = messages.getMessage('summary'); public static readonly description = messages.getMessage('description'); public static readonly examples = messages.getMessages('examples'); - public static state = 'beta'; - public static readonly hidden = true; + public static state = 'ga'; public static readonly envVariablesSection = toHelpSection( 'ENVIRONMENT VARIABLES', @@ -207,14 +97,10 @@ export default class AgentTestRunEval extends SfCommand { // If spec looks like it might be a file path (not parseable content), read the file try { - // Try to detect if it's actual content vs a file path - // If it's a valid YAML/JSON, it's content; otherwise treat as file path if (!isYamlTestSpec(rawContent)) { JSON.parse(rawContent); } - // If we got here, it's valid content } catch { - // Not valid content, must be a file path - read it try { rawContent = await readFile(flags.spec, 'utf-8'); } catch (e) { @@ -228,17 +114,14 @@ export default class AgentTestRunEval extends SfCommand { let agentApiName = flags['api-name']; if (isYamlTestSpec(rawContent)) { - // YAML TestSpec detected — translate to EvalPayload const spec = parseTestSpec(rawContent); payload = translateTestSpec(spec); - // Auto-infer api-name from subjectName if not explicitly provided if (!agentApiName) { agentApiName = spec.subjectName; this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()])); } } else { - // JSON EvalPayload (original behavior) try { payload = JSON.parse(rawContent) as EvalPayload; } catch (e) { @@ -303,16 +186,14 @@ export default class AgentTestRunEval extends SfCommand { const mergedResponse: EvalApiResponse = { results: allResults as EvalApiResponse['results'] }; - // 9. Format output + // 8. Format output const resultFormat = (flags['result-format'] ?? 'human') as ResultFormat; const formatted = formatResults(mergedResponse, resultFormat); this.log(formatted); - // 10. Build structured result for --json + // 9. Build structured result for --json const { summary, testSummaries } = buildResultSummary(mergedResponse); - // Set exit code to 1 only for execution errors (tests couldn't run) - // Test failures (assertions failed) are business logic and should not affect exit code if (summary.errors > 0) { process.exitCode = 1; } diff --git a/src/evalFormatter.ts b/src/evalFormatter.ts deleted file mode 100644 index 2b965c7e..00000000 --- a/src/evalFormatter.ts +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright 2026, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -export type ResultFormat = 'human' | 'json' | 'junit' | 'tap'; - -type EvalOutput = { - type?: string; - id?: string; - session_id?: string; - response?: unknown; -}; - -type EvalResult = { - id?: string; - score?: number | null; - is_pass?: boolean | null; - actual_value?: string; - expected_value?: string; - error_message?: string; -}; - -type TestError = { - id?: string; - error_message?: string; -}; - -type TestResult = { - id?: string; - outputs?: EvalOutput[]; - evaluation_results?: EvalResult[]; - errors?: TestError[]; -}; - -export type EvalApiResponse = { - results?: TestResult[]; -}; - -export function formatResults(results: EvalApiResponse, format: ResultFormat): string { - switch (format) { - case 'human': - return formatHuman(results); - case 'json': - return JSON.stringify(results, null, 2); - case 'junit': - return formatJunit(results); - case 'tap': - return formatTap(results); - default: - return formatHuman(results); - } -} - -// --- formatHuman helpers --- - -function formatOutputLines(outputs: EvalOutput[]): string[] { - const lines: string[] = []; - - for (const output of outputs) { - const stepType = output.type ?? ''; - const stepId = output.id ?? ''; - - if (stepType === 'agent.create_session') { - const sessionId = output.session_id ?? 'N/A'; - lines.push(`- **Create Session**: ${sessionId}`); - } else if (stepType === 'agent.send_message') { - let agentMsg = output.response; - if (agentMsg !== null && typeof agentMsg === 'object' && !Array.isArray(agentMsg)) { - const msgObj = agentMsg as Record<string, unknown>; - const msgs = msgObj.messages as Array<Record<string, unknown>> | undefined; - agentMsg = msgs?.[0]?.message ?? String(agentMsg); - } - const msgStr = String(agentMsg ?? ''); - const displayMsg = msgStr.length > 200 ? msgStr.substring(0, 200) + '...' : msgStr; - lines.push(`- **Agent Response** (${stepId}): ${displayMsg}`); - } else if (stepType === 'agent.get_state') { - const respData = output.response; - if (respData !== null && typeof respData === 'object') { - const resp = respData as Record<string, unknown>; - const planner = resp.planner_response as Record<string, unknown> | undefined; - const lastExec = planner?.lastExecution as Record<string, unknown> | undefined; - const topic = lastExec?.topic ?? 'N/A'; - const latency = lastExec?.latency ?? 'N/A'; - lines.push(`- **Topic Selected**: ${String(topic)}`); - lines.push(`- **Response Latency**: ${String(latency)}ms`); - } else { - lines.push(`- **State**: ${String(respData).substring(0, 200)}`); - } - } - } - - return lines; -} - -function formatEvaluationTable(evalResults: EvalResult[]): string[] { - const lines: string[] = []; - - if (evalResults.length > 0) { - lines.push('### Evaluation Results\n'); - lines.push('| Metric | Score | Pass | Actual | Expected |'); - lines.push('|--------|-------|------|--------|----------|'); - - for (const evalR of evalResults) { - const metricId = evalR.id ?? 'unknown'; - const score = evalR.score; - const scoreStr = score != null ? score.toFixed(3) : 'N/A'; - const isPass = evalR.is_pass; - const passStr = isPass === true ? 'PASS' : isPass === false ? 'FAIL' : 'N/A'; - const actual = String(evalR.actual_value ?? '').substring(0, 60); - const expected = String(evalR.expected_value ??
'').substring(0, 60); - const error = evalR.error_message; - - if (error) { - lines.push(`| ${metricId} | ERROR | - | ${error.substring(0, 80)} | - |`); - } else { - lines.push(`| ${metricId} | ${scoreStr} | ${passStr} | ${actual} | ${expected} |`); - } - } - - lines.push(''); - } - - return lines; -} - -function formatErrorLines(errors: TestError[]): string[] { - const lines: string[] = []; - - if (errors.length > 0) { - lines.push('### Errors\n'); - for (const error of errors) { - const errorId = error.id ?? 'unknown'; - const errorMsg = error.error_message ?? String(error); - lines.push(`- **${errorId}**: ${errorMsg}`); - } - lines.push(''); - } - - return lines; -} - -function formatTestSummaryLines(evalResults: EvalResult[], errors: TestError[]): string[] { - const lines: string[] = []; - - const totalEvals = evalResults.length; - const passed = evalResults.filter((e) => e.is_pass === true).length; - const failed = evalResults.filter((e) => e.is_pass === false).length; - const scored = evalResults.filter((e) => e.score != null && e.is_pass == null).length; - - lines.push(`**Summary**: ${totalEvals} evaluations`); - if (passed || failed) { - lines.push(` - Passed: ${passed}, Failed: ${failed}`); - } - if (scored) { - lines.push(` - Scored (no threshold): ${scored}`); - } - if (errors.length > 0) { - lines.push(` - Errors: ${errors.length}`); - } - lines.push(''); - - return lines; -} - -function formatHuman(results: EvalApiResponse): string { - const lines: string[] = ['# Agent Evaluation Results\n']; - - for (const testResult of results.results ?? []) { - const testId = testResult.id ?? 'unknown'; - const errors = testResult.errors ?? []; - const evalResults = testResult.evaluation_results ?? []; - const outputs = testResult.outputs ?? []; - - lines.push(`## Test: ${testId}\n`); - - lines.push(...formatOutputLines(outputs)); - lines.push(''); - lines.push(...formatEvaluationTable(evalResults)); - lines.push(...formatErrorLines(errors)); - lines.push(...formatTestSummaryLines(evalResults, errors)); - } - - return lines.join('\n'); -} - -function formatJunit(results: EvalApiResponse): string { - const allTests: Array<{ - name: string; - classname: string; - failed: boolean; - errored: boolean; - message: string; - score: string; - }> = []; - - for (const testResult of results.results ?? []) { - const testId = testResult.id ?? 'unknown'; - - for (const evalR of testResult.evaluation_results ?? []) { - const stepId = evalR.id ?? 'unknown'; - const name = `${testId}.${stepId}`; - const score = evalR.score; - const isPass = evalR.is_pass; - const error = evalR.error_message; - - allTests.push({ - name, - classname: 'agent-eval-labs', - failed: isPass === false, - errored: !!error, - message: error - ? error - : isPass === false - ? `Expected ${String(evalR.expected_value ?? '')} but got ${String(evalR.actual_value ?? '')}` - : '', - score: score != null ? score.toFixed(3) : 'N/A', - }); - } - - for (const err of testResult.errors ?? []) { - const stepId = err.id ?? 'unknown'; - allTests.push({ - name: `${testId}.${stepId}`, - classname: 'agent-eval-labs', - failed: false, - errored: true, - message: err.error_message ?? 
'Unknown error', - score: 'N/A', - }); - } - } - - const totalTests = allTests.length; - const failures = allTests.filter((t) => t.failed).length; - const errors = allTests.filter((t) => t.errored).length; - - const lines: string[] = [ - '<?xml version="1.0" encoding="UTF-8"?>', - '<testsuites>', - `  <testsuite name="agent-eval-labs" tests="${totalTests}" failures="${failures}" errors="${errors}">`, - ]; - - for (const tc of allTests) { - lines.push(`    <testcase name="${tc.name}" classname="${tc.classname}">`); - if (tc.errored) { - lines.push(`      <error>${escapeXml(tc.message)}</error>`); - } else if (tc.failed) { - lines.push(`      <failure message="${escapeXml(tc.message)}">Score: ${tc.score}</failure>`); - } - lines.push('    </testcase>'); - } - - lines.push('  </testsuite>'); - lines.push('</testsuites>'); - - return lines.join('\n'); -} - -// --- formatTap helpers --- - -type TapEntry = { - ok: boolean; - name: string; - score: string; - expected?: string; - actual?: string; - error?: string; -}; - -function buildTapEntries(results: EvalApiResponse): TapEntry[] { - const entries: TapEntry[] = []; - - for (const testResult of results.results ?? []) { - const testId = testResult.id ?? 'unknown'; - - for (const evalR of testResult.evaluation_results ?? []) { - const stepId = evalR.id ?? 'unknown'; - const name = `${testId}.${stepId}`; - const score = evalR.score; - const isPass = evalR.is_pass; - const error = evalR.error_message; - - entries.push({ - ok: isPass !== false && !error, - name, - score: score != null ? score.toFixed(3) : 'N/A', - expected: evalR.expected_value != null ? String(evalR.expected_value) : undefined, - actual: evalR.actual_value != null ? String(evalR.actual_value) : undefined, - error: error ?? undefined, - }); - } - - for (const err of testResult.errors ?? []) { - const stepId = err.id ?? 'unknown'; - entries.push({ - ok: false, - name: `${testId}.${stepId}`, - score: 'N/A', - error: err.error_message ?? 'Unknown error', - }); - } - } - - return entries; -} - -function formatTap(results: EvalApiResponse): string { - const entries = buildTapEntries(results); - - const lines: string[] = ['TAP version 13', `1..${entries.length}`]; - - for (let i = 0; i < entries.length; i++) { - const e = entries[i]; - const num = i + 1; - const prefix = e.ok ? 'ok' : 'not ok'; - lines.push(`${prefix} ${num} - ${e.name} (score: ${e.score})`); - - if (!e.ok) { - lines.push(' ---'); - if (e.expected !== undefined) { - lines.push(` expected: "${e.expected}"`); - } - if (e.actual !== undefined) { - lines.push(` actual: "${e.actual}"`); - } - if (e.error) { - lines.push(` error: "${e.error}"`); - } - lines.push(' ...'); - } - } - - return lines.join('\n'); -} - -function escapeXml(str: string): string { - return str - .replace(/&/g, '&amp;') - .replace(/</g, '&lt;') - .replace(/>/g, '&gt;') - .replace(/"/g, '&quot;') - .replace(/'/g, '&apos;'); -} diff --git a/src/evalNormalizer.ts b/src/evalNormalizer.ts deleted file mode 100644 index f9901cd1..00000000 --- a/src/evalNormalizer.ts +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Copyright 2026, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -/* eslint-disable camelcase */ - -// --- Types --- - -export type EvalPayload = { - tests: EvalTest[]; -}; - -export type EvalTest = { - id: string; - steps: EvalStep[]; -}; - -export type EvalStep = { - [key: string]: unknown; - type: string; - id: string; -}; - -// --- Evaluator classification --- - -const SCORING_EVALUATORS = new Set([ - 'evaluator.text_alignment', - 'evaluator.hallucination_detection', - 'evaluator.citation_recall', - 'evaluator.answer_faithfulness', -]); - -const ASSERTION_EVALUATORS = new Set(['evaluator.string_assertion', 'evaluator.json_assertion']); - -const DEFAULT_METRIC_NAMES: Record<string, string> = { - 'evaluator.text_alignment': 'base.cosine_similarity', - 'evaluator.hallucination_detection': 'hallucination_detection', - 'evaluator.citation_recall': 'citation_recall', - 'evaluator.answer_faithfulness': 'answer_faithfulness', -}; - -const SCORING_VALID_FIELDS = new Set([ - 'type', - 'id', - 'generated_output', - 'reference_answer', - 'metric_name', - 'threshold', -]); - -const ASSERTION_VALID_FIELDS = new Set([ - 'type', - 'id', - 'actual', - 'expected', - 'operator', - 'threshold', - 'json_path', - 'json_schema', - 'metric_name', -]); - -const VALID_AGENT_FIELDS: Record<string, Set<string>> = { - 'agent.create_session': new Set([ - 'type', - 'id', - 'agent_id', - 'agent_version_id', - 'use_agent_api', - 'planner_id', - 'state', - 'setupSessionContext', - 'context_variables', - ]), - 'agent.send_message': new Set(['type', 'id', 'session_id', 'utterance']), - 'agent.get_state': new Set(['type', 'id', 'session_id']), -}; - -// --- Auto-correction maps --- - -const AGENT_CORRECTIONS: Record<string, string> = { - agentId: 'agent_id', - agentVersionId: 'agent_version_id', - sessionId: 'session_id', - text: 'utterance', - message: 'utterance', - input: 'utterance', - prompt: 'utterance', - user_message: 'utterance', - userMessage: 'utterance', -}; - -const EVALUATOR_CORRECTIONS: Record<string, string> = { - subject: 'actual', - expectedValue: 'expected', - expected_value: 'expected', - actualValue: 'actual', - actual_value: 'actual', - assertionType: 'operator', - assertion_type: 'operator', - comparator: 'operator', -}; - -// --- camelCase alias maps for agent.create_session --- - -const AGENT_FIELD_ALIASES: Record<string, string> = { - useAgentApi: 'use_agent_api', - plannerId: 'planner_id', - plannerDefinitionId: 'planner_id', - planner_definition_id: 'planner_id', - planner_version_id: 'planner_id', - plannerVersionId: 'planner_id', -}; - -// --- Scoring evaluator field aliases --- - -const SCORING_FIELD_ALIASES: Record<string, string> = { - actual: 'generated_output', - expected: 'reference_answer', - actual_value: 'generated_output', - expected_value: 'reference_answer', - actual_output: 'generated_output', - expected_output: 'reference_answer', - response: 'generated_output', - ground_truth: 'reference_answer', -}; - -// --- Assertion evaluator field aliases --- - -const ASSERTION_FIELD_ALIASES: Record<string, string> = { - actual_value: 'actual', - expected_value: 'expected', - generated_output: 'actual', - reference_answer: 'expected', - actual_output: 'actual', - expected_output: 'expected', - response: 'actual', - ground_truth: 'expected', -}; - -// --- MCP shorthand field mapping --- - -// MCP uses `field: "gs1.planner_state.topic"` — map to Eval API `actual` with correct JSONPath -const MCP_FIELD_MAP: Record<string, string> = { - 'planner_state.topic': 'response.planner_response.lastExecution.topic', - 'planner_state.invokedActions': 'response.planner_response.lastExecution.invokedActions', - 'planner_state.actionsSequence':
'response.planner_response.lastExecution.invokedActions', - response: 'response', - 'response.messages': 'response', -}; - -// --- Main entry point --- - -/** - * Apply all normalizations to a test payload. - * Passes run in order: mcp-shorthand -> auto-correct -> camelCase -> evaluator fields -> shorthand refs -> defaults -> strip. - */ -export function normalizePayload(payload: EvalPayload): EvalPayload { - const normalized: EvalPayload = { - tests: payload.tests.map((test) => { - let steps = [...test.steps]; - steps = normalizeMcpShorthand(steps); - steps = autoCorrectFields(steps); - steps = normalizeCamelCase(steps); - steps = normalizeEvaluatorFields(steps); - steps = convertShorthandRefs(steps); - steps = injectDefaults(steps); - steps = stripUnrecognizedFields(steps); - return { ...test, steps }; - }), - }; - return normalized; -} - -// --- Individual normalization passes --- - -/** - * Convert MCP shorthand format to raw Eval API format. - * MCP uses type="evaluator" + evaluator_type, raw API uses type="evaluator.xxx". - * Also maps `field` to `actual` with proper JSONPath and auto-generates missing `id` fields. - */ -export function normalizeMcpShorthand(steps: EvalStep[]): EvalStep[] { - let evalCounter = 0; - - return steps.map((step) => { - const evaluator_type = step.evaluator_type as string | undefined; - - // Only applies to MCP shorthand: type="evaluator" with evaluator_type field - if (step.type !== 'evaluator' || !evaluator_type) return step; - - const normalized = { ...step }; - - // Merge type: "evaluator" + evaluator_type: "xxx" → type: "evaluator.xxx" - normalized.type = `evaluator.${evaluator_type}`; - delete normalized.evaluator_type; - - // Convert `field` to `actual` with proper shorthand ref format - if ('field' in normalized) { - if (!('actual' in normalized)) { - const fieldValue = normalized.field as string; - - // Parse "gs1.planner_state.topic" → stepId="gs1", fieldPath="planner_state.topic" - const dotIdx = fieldValue.indexOf('.'); - if (dotIdx > 0) { - const stepId = fieldValue.substring(0, dotIdx); - const fieldPath = fieldValue.substring(dotIdx + 1); - const mappedPath = MCP_FIELD_MAP[fieldPath] ?? fieldPath; - normalized.actual = `{${stepId}.${mappedPath}}`; - } else { - normalized.actual = fieldValue; - } - } - delete normalized.field; - } - - // Auto-generate id if missing - if (!normalized.id || normalized.id === '') { - normalized.id = `eval_${evalCounter}`; - } - evalCounter++; - - return normalized as EvalStep; - }); -} - -/** - * Auto-correct common field name mistakes. - * Maps wrong field names to correct ones (agentId->agent_id, text->utterance, etc.) - */ -export function autoCorrectFields(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - const corrected = { ...step }; - const stepType = corrected.type ?? ''; - - if (stepType.startsWith('agent.')) { - for (const [wrong, correct] of Object.entries(AGENT_CORRECTIONS)) { - if (wrong in corrected && !(correct in corrected)) { - corrected[correct] = corrected[wrong]; - delete corrected[wrong]; - } - } - } else if (stepType.startsWith('evaluator.')) { - for (const [wrong, correct] of Object.entries(EVALUATOR_CORRECTIONS)) { - if (wrong in corrected && !(correct in corrected)) { - corrected[correct] = corrected[wrong]; - delete corrected[wrong]; - } - } - } - - return corrected as EvalStep; - }); -} - -/** - * Normalize camelCase agent field names to snake_case. - * useAgentApi->use_agent_api, plannerDefinitionId->planner_id, etc. 
- */ -export function normalizeCamelCase(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - if (step.type !== 'agent.create_session') return step; - - const normalized = { ...step }; - for (const [alias, canonical] of Object.entries(AGENT_FIELD_ALIASES)) { - if (alias in normalized) { - if (!(canonical in normalized)) { - normalized[canonical] = normalized[alias]; - } - delete normalized[alias]; - } - } - return normalized as EvalStep; - }); -} - -/** - * Apply field aliases: remap alias keys to canonical keys, removing duplicates. - */ -function applyFieldAliases(step: EvalStep, aliases: Record): void { - for (const [alias, canonical] of Object.entries(aliases)) { - if (alias in step && !(canonical in step)) { - step[canonical] = step[alias]; - delete step[alias]; - } else if (alias in step && canonical in step) { - delete step[alias]; - } - } -} - -/** - * Normalize a scoring evaluator step (field aliases + metric_name injection). - */ -function normalizeScoringEvaluator(normalized: EvalStep, evalType: string): void { - applyFieldAliases(normalized, SCORING_FIELD_ALIASES); - - // Auto-inject or correct metric_name - if (!('metric_name' in normalized)) { - const defaultMetric = DEFAULT_METRIC_NAMES[evalType]; - if (defaultMetric) { - normalized.metric_name = defaultMetric; - } - } else if (normalized.metric_name === evalType.split('.')[1]) { - const defaultMetric = DEFAULT_METRIC_NAMES[evalType]; - if (defaultMetric) { - normalized.metric_name = defaultMetric; - } - } -} - -/** - * Normalize an assertion evaluator step (field aliases + operator lowercase + metric_name). - */ -function normalizeAssertionEvaluator(normalized: EvalStep, evalType: string): void { - applyFieldAliases(normalized, ASSERTION_FIELD_ALIASES); - - // Auto-lowercase operator - if ('operator' in normalized && typeof normalized.operator === 'string') { - normalized.operator = normalized.operator.toLowerCase(); - } - - // Auto-inject metric_name for assertion evaluators - if (!('metric_name' in normalized)) { - normalized.metric_name = evalType.split('.')[1]; - } -} - -/** - * Normalize evaluator field names based on evaluator category. - * Maps actual/expected <-> generated_output/reference_answer. - * Also auto-lowercases operator values and auto-injects metric_name. - */ -export function normalizeEvaluatorFields(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - const evalType = step.type ?? ''; - if (!evalType.startsWith('evaluator.')) return step; - - const normalized = { ...step }; - - if (SCORING_EVALUATORS.has(evalType)) { - normalizeScoringEvaluator(normalized, evalType); - } else if (ASSERTION_EVALUATORS.has(evalType)) { - normalizeAssertionEvaluator(normalized, evalType); - } - // Don't inject metric_name for unknown evaluator types to avoid API validation errors - // Unknown evaluators like bot_response_rating and planner_topic_assertion don't use metric_name - - return normalized as EvalStep; - }); -} - -/** - * Convert {step_id.field} shorthand references to JSONPath $.outputs[N].field. - * Builds step_id->index mapping from non-evaluator steps. - */ -export function convertShorthandRefs(steps: EvalStep[]): EvalStep[] { - // Build step_id -> output-array index mapping - const stepIdToIdx: Record = {}; - let outputIdx = 0; - for (const step of steps) { - const sid = step.id; - const stype = step.type ?? 
''; - if (sid && !stype.startsWith('evaluator.')) { - stepIdToIdx[sid] = outputIdx; - outputIdx += 1; - } - } - - const refPattern = /\{([^}]+)\}/g; - - function replaceValue(value: unknown): unknown { - if (typeof value !== 'string') return value; - - return value.replace(refPattern, (match, ref: string) => { - const dotIdx = ref.indexOf('.'); - if (dotIdx < 0) return match; - - const sid = ref.substring(0, dotIdx); - let field = ref.substring(dotIdx + 1); - - if (!(sid in stepIdToIdx)) return match; - - const idx = stepIdToIdx[sid]; - - // Normalize legacy nested-response path to flat response - if (field.startsWith('response.messages')) { - field = 'response'; - } - - return `$.outputs[${idx}].${field}`; - }); - } - - return steps.map((step) => { - const newStep: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (typeof val === 'string') { - newStep[key] = replaceValue(val); - } else if (val !== null && typeof val === 'object' && !Array.isArray(val)) { - const newObj: Record<string, unknown> = {}; - for (const [k, v] of Object.entries(val as Record<string, unknown>)) { - newObj[k] = typeof v === 'string' ? replaceValue(v) : v; - } - newStep[key] = newObj; - } else if (Array.isArray(val)) { - newStep[key] = (val as unknown[]).map((item: unknown) => - typeof item === 'string' ? replaceValue(item) : item - ); - } else { - newStep[key] = val; - } - } - return newStep as EvalStep; - }); -} - -/** - * Inject default values: - * - use_agent_api=true on agent.create_session if neither use_agent_api nor planner_id present - */ -export function injectDefaults(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - if (step.type === 'agent.create_session') { - if (!('use_agent_api' in step) && !('planner_id' in step)) { - return { ...step, use_agent_api: true }; - } - } - return step; - }); -} - -/** - * Strip unrecognized fields from steps based on type-specific whitelists. - */ -export function stripUnrecognizedFields(steps: EvalStep[]): EvalStep[] { - return steps.map((step) => { - const stepType = step.type ?? ''; - - // Agent steps - if (stepType in VALID_AGENT_FIELDS) { - const validFields = VALID_AGENT_FIELDS[stepType]; - const stripped: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (validFields.has(key)) { - stripped[key] = val; - } - } - return stripped as EvalStep; - } - - // Scoring evaluators - if (SCORING_EVALUATORS.has(stepType)) { - const stripped: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (SCORING_VALID_FIELDS.has(key)) { - stripped[key] = val; - } - } - return stripped as EvalStep; - } - - // Assertion evaluators - if (ASSERTION_EVALUATORS.has(stepType)) { - const stripped: Record<string, unknown> = {}; - for (const [key, val] of Object.entries(step)) { - if (ASSERTION_VALID_FIELDS.has(key)) { - stripped[key] = val; - } - } - return stripped as EvalStep; - } - - // Unknown types: don't strip (to avoid breaking future evaluator types) - return step; - }); -} - -// --- Batch splitting --- - -/** - * Split tests array into chunks of batchSize. - */ -export function splitIntoBatches(tests: EvalTest[], batchSize: number): EvalTest[][] { - const batches: EvalTest[][] = []; - for (let i = 0; i < tests.length; i += batchSize) { - batches.push(tests.slice(i, i + batchSize)); - } - return batches; -} diff --git a/src/yamlSpecTranslator.ts b/src/yamlSpecTranslator.ts deleted file mode 100644 index b19015f2..00000000 --- a/src/yamlSpecTranslator.ts +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright 2026, Salesforce, Inc.
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* eslint-disable camelcase */ - -import { parse as parseYaml } from 'yaml'; -import type { TestSpec, TestCase } from '@salesforce/agents'; -import type { EvalPayload, EvalTest, EvalStep } from './evalNormalizer.js'; - -// --- JSONPath mappings from org model to Eval API refs --- - -const ACTUAL_PATH_MAP: Record = { - '$.generatedData.outcome': '{sm.response}', - '$.generatedData.topic': '{gs.response.planner_response.lastExecution.topic}', - '$.generatedData.invokedActions': '{gs.response.planner_response.lastExecution.invokedActions}', - '$.generatedData.actionsSequence': '{gs.response.planner_response.lastExecution.invokedActions}', -}; - -// --- Custom evaluation name to evaluator type mapping --- - -const CUSTOM_EVAL_TYPE_MAP: Record = { - string_comparison: 'evaluator.string_assertion', - numeric_comparison: 'evaluator.numeric_assertion', -}; - -// JSONPaths that require the get_state step -const PLANNER_PATHS = new Set([ - '$.generatedData.topic', - '$.generatedData.invokedActions', - '$.generatedData.actionsSequence', -]); - -// --- Public API --- - -/** - * Returns true if the content looks like a YAML TestSpec (has testCases + subjectName). - * Returns false for JSON EvalPayload, invalid content, or YAML missing required fields. - */ -export function isYamlTestSpec(content: string): boolean { - try { - const parsed: unknown = parseYaml(content); - if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) { - return false; - } - const obj = parsed as Record; - return Array.isArray(obj.testCases) && typeof obj.subjectName === 'string'; - } catch { - return false; - } -} - -/** - * Parse a YAML string into a TestSpec. - * Throws if the content is not valid YAML or is missing required fields. - */ -export function parseTestSpec(content: string): TestSpec { - const parsed: unknown = parseYaml(content); - if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) { - throw new Error('Invalid TestSpec: expected a YAML object'); - } - const obj = parsed as Record; - if (!Array.isArray(obj.testCases)) { - throw new Error('Invalid TestSpec: missing testCases array'); - } - if (typeof obj.subjectName !== 'string') { - throw new Error('Invalid TestSpec: missing subjectName'); - } - if (typeof obj.name !== 'string') { - throw new Error('Invalid TestSpec: missing name'); - } - return parsed as TestSpec; -} - -/** - * Translate a full TestSpec into an EvalPayload. - */ -export function translateTestSpec(spec: TestSpec): EvalPayload { - return { - tests: spec.testCases.map((tc, idx) => translateTestCase(tc, idx, spec.name)), - }; -} - -/** - * Translate a single TestCase into an EvalTest with ordered steps. - */ -export function translateTestCase(testCase: TestCase, index: number, specName?: string): EvalTest { - const id = specName ? `${specName}_case_${index}` : `test_case_${index}`; - const steps: EvalStep[] = []; - - // 1. 
agent.create_session - const createSessionStep: EvalStep = { - type: 'agent.create_session', - id: 'cs', - use_agent_api: true, - }; - - if (testCase.contextVariables && testCase.contextVariables.length > 0) { - // Validate for duplicate names - const names = testCase.contextVariables.map((cv) => cv.name); - const duplicates = names.filter((name, idx) => names.indexOf(name) !== idx); - if (duplicates.length > 0) { - throw new Error( - `Duplicate contextVariable names found in test case ${index}: ${[...new Set(duplicates)].join( - ', ' - )}. Each contextVariable name must be unique.` - ); - } - - createSessionStep.context_variables = Object.fromEntries( - testCase.contextVariables.map((cv) => [cv.name, cv.value]) - ); - } - - steps.push(createSessionStep); - - // 2. Conversation history — only user messages become send_message steps - let historyIdx = 0; - if (testCase.conversationHistory) { - for (const entry of testCase.conversationHistory) { - if (entry.role === 'user') { - steps.push({ - type: 'agent.send_message', - id: `history_${historyIdx}`, - session_id: '{cs.session_id}', - utterance: entry.message, - }); - historyIdx++; - } - } - } - - // 3. Test utterance - steps.push({ - type: 'agent.send_message', - id: 'sm', - session_id: '{cs.session_id}', - utterance: testCase.utterance, - }); - - // 4. Determine if get_state is needed - const needsGetState = needsPlannerState(testCase); - if (needsGetState) { - steps.push({ - type: 'agent.get_state', - id: 'gs', - session_id: '{cs.session_id}', - }); - } - - // 5. Evaluators - if (testCase.expectedTopic !== undefined) { - steps.push({ - type: 'evaluator.planner_topic_assertion', - id: 'check_topic', - expected: testCase.expectedTopic, - actual: '{gs.response.planner_response.lastExecution.topic}', - operator: 'contains', - }); - } - - if (testCase.expectedActions !== undefined && testCase.expectedActions.length > 0) { - steps.push({ - type: 'evaluator.planner_actions_assertion', - id: 'check_actions', - expected: testCase.expectedActions, - actual: '{gs.response.planner_response.lastExecution.invokedActions}', - operator: 'includes_items', - }); - } - - if (testCase.expectedOutcome !== undefined) { - steps.push({ - type: 'evaluator.bot_response_rating', - id: 'check_outcome', - utterance: testCase.utterance, - expected: testCase.expectedOutcome, - actual: '{sm.response}', - threshold: 3.0, - }); - } - - if (testCase.customEvaluations) { - testCase.customEvaluations.forEach((customEval, customIdx) => { - const step = translateCustomEvaluation(customEval, customIdx); - steps.push(step); - }); - } - - return { id, steps }; -} - -// --- Internal helpers --- - -/** - * Determine whether the get_state step is needed for this test case. - */ -function needsPlannerState(testCase: TestCase): boolean { - if (testCase.expectedTopic !== undefined) return true; - if (testCase.expectedActions !== undefined && testCase.expectedActions.length > 0) return true; - - if (testCase.customEvaluations) { - for (const customEval of testCase.customEvaluations) { - for (const param of customEval.parameters) { - if (param.name === 'actual' && PLANNER_PATHS.has(param.value)) { - return true; - } - } - } - } - - return false; -} - -/** - * Translate a single customEvaluation entry into an EvalStep. - */ -function translateCustomEvaluation( - customEval: NonNullable[number], - index: number -): EvalStep { - const evalType = CUSTOM_EVAL_TYPE_MAP[customEval.name] ?? 
`evaluator.${customEval.name}`; - - let operator = ''; - let actual = ''; - let expected = ''; - - for (const param of customEval.parameters) { - if (param.name === 'operator') { - operator = param.value; - } else if (param.name === 'actual') { - actual = mapActualPath(param.value); - } else if (param.name === 'expected') { - expected = param.value; - } - } - - return { - type: evalType, - id: `custom_${index}`, - operator, - actual, - expected, - }; -} - -/** - * Map an org-model JSONPath to the Eval API shorthand ref. - * Unknown paths are returned as-is. - */ -function mapActualPath(path: string): string { - return ACTUAL_PATH_MAP[path] ?? path; -} diff --git a/test/commands/agent/test/run-eval.test.ts b/test/commands/agent/test/run-eval.test.ts new file mode 100644 index 00000000..cccef486 --- /dev/null +++ b/test/commands/agent/test/run-eval.test.ts @@ -0,0 +1,360 @@ +/* + * Copyright 2026, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, + @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, camelcase */ + +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { expect } from 'chai'; +import sinon from 'sinon'; +import esmock from 'esmock'; +import { TestContext, MockTestOrgData } from '@salesforce/core/testSetup'; + +// ─── Shared fixtures ───────────────────────────────────────────────────────── + +const EVAL_PAYLOAD = JSON.stringify({ + tests: [ + { + id: 'test-topic-routing', + steps: [ + { type: 'agent.create_session', id: 'session' }, + { + type: 'agent.send_message', + id: 'msg1', + session_id: '{session.session_id}', + utterance: 'What is the weather?', + }, + { type: 'agent.get_state', id: 'state1', session_id: '{session.session_id}' }, + { + type: 'evaluator.planner_topic_assertion', + id: 'check-topic', + actual: '{state1.response.planner_response.lastExecution.topic}', + expected: 'Weather_and_Temperature_Information', + operator: 'equals', + }, + ], + }, + ], +}); + +const YAML_SPEC = ` +name: Weather_Test +description: Test weather agent +subjectType: AGENT +subjectName: Local_Info_Agent +testCases: + - utterance: 'What is the weather?' 
+ expectedTopic: Weather_and_Temperature_Information + expectedActions: [] + expectedOutcome: 'The agent should provide weather information' +`; + +const MOCK_API_RESULTS = [ + { + id: 'test-topic-routing', + evaluation_results: [{ id: 'check-topic', is_pass: true }], + errors: [], + outputs: [], + }, +]; + +// ─── Test suite ────────────────────────────────────────────────────────────── + +describe('agent test run-eval command', () => { + const $$ = new TestContext(); + let testOrg: MockTestOrgData; + let tmpDir: string; + + // Stubs for @salesforce/agents exports + let isYamlTestSpecStub: sinon.SinonStub; + let parseTestSpecStub: sinon.SinonStub; + let translateTestSpecStub: sinon.SinonStub; + let normalizePayloadStub: sinon.SinonStub; + let splitIntoBatchesStub: sinon.SinonStub; + let resolveAgentStub: sinon.SinonStub; + let executeBatchesStub: sinon.SinonStub; + let buildResultSummaryStub: sinon.SinonStub; + let formatResultsStub: sinon.SinonStub; + + let AgentTestRunEval: any; + + beforeEach(async () => { + testOrg = new MockTestOrgData(); + await $$.stubAuths(testOrg); + + tmpDir = mkdtempSync(join(tmpdir(), 'run-eval-test-')); + + // Default stub implementations + isYamlTestSpecStub = sinon.stub().returns(false); + parseTestSpecStub = sinon.stub().returns({ + name: 'Weather_Test', + subjectName: 'Local_Info_Agent', + testCases: [{ utterance: 'What is the weather?' }], + }); + translateTestSpecStub = sinon.stub().returns(JSON.parse(EVAL_PAYLOAD)); + normalizePayloadStub = sinon.stub().callsFake((p: unknown) => p); + splitIntoBatchesStub = sinon.stub().callsFake((tests: unknown[]) => [tests]); + resolveAgentStub = sinon.stub().resolves({ agentId: 'bot-001', versionId: 'ver-001' }); + executeBatchesStub = sinon.stub().resolves(MOCK_API_RESULTS); + buildResultSummaryStub = sinon.stub().returns({ + summary: { passed: 1, failed: 0, scored: 0, errors: 0 }, + testSummaries: [{ id: 'test-topic-routing', status: 'passed', evaluations: [], outputs: [] }], + }); + formatResultsStub = sinon.stub().returns('# Agent Evaluation Results'); + + const mod = await esmock('../../../../src/commands/agent/test/run-eval.js', { + '@salesforce/agents': { + isYamlTestSpec: isYamlTestSpecStub, + parseTestSpec: parseTestSpecStub, + translateTestSpec: translateTestSpecStub, + normalizePayload: normalizePayloadStub, + splitIntoBatches: splitIntoBatchesStub, + resolveAgent: resolveAgentStub, + executeBatches: executeBatchesStub, + buildResultSummary: buildResultSummaryStub, + formatResults: formatResultsStub, + }, + }); + + AgentTestRunEval = mod.default; + }); + + afterEach(() => { + sinon.restore(); + $$.restore(); + rmSync(tmpDir, { recursive: true, force: true }); + }); + + // ─── State ───────────────────────────────────────────────────────────────── + + describe('command metadata', () => { + it('is marked as GA state', () => { + expect(AgentTestRunEval.state).to.equal('ga'); + }); + + it('is not hidden', () => { + expect(AgentTestRunEval.hidden).to.not.equal(true); + }); + }); + + // ─── JSON payload path ───────────────────────────────────────────────────── + + describe('JSON payload', () => { + it('runs with an inline JSON string', async () => { + const result = await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(result.summary.passed).to.equal(1); + expect(result.tests).to.have.length(1); + expect(result.tests[0].status).to.equal('passed'); + }); + + it('reads the spec from a file when the string is not valid JSON', async () => { + const specFile = 
join(tmpDir, 'payload.json'); + writeFileSync(specFile, EVAL_PAYLOAD, 'utf-8'); + + const result = await AgentTestRunEval.run(['--spec', specFile, '--target-org', testOrg.username]); + + expect(result.summary.passed).to.equal(1); + }); + + it('throws SpecFileNotFound (exit 2) when the file does not exist', async () => { + try { + await AgentTestRunEval.run(['--spec', '/nonexistent/path.json', '--target-org', testOrg.username]); + expect.fail('should have thrown'); + } catch (err: any) { + expect(err.exitCode).to.equal(2); + expect(err.name).to.equal('SpecFileNotFound'); + } + }); + + it('calls normalizePayload by default', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(normalizePayloadStub.calledOnce).to.be.true; + }); + + it('skips normalizePayload when --no-normalize is set', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--no-normalize', '--target-org', testOrg.username]); + + expect(normalizePayloadStub.called).to.be.false; + }); + + it('resolves agent IDs when --api-name is provided', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--api-name', 'My_Agent', '--target-org', testOrg.username]); + + expect(resolveAgentStub.calledOnceWith(sinon.match.any, 'My_Agent')).to.be.true; + }); + + it('throws AgentNotFound (exit 2) when resolveAgent fails', async () => { + resolveAgentStub.rejects(new Error('not found')); + + try { + await AgentTestRunEval.run([ + '--spec', + EVAL_PAYLOAD, + '--api-name', + 'Missing_Agent', + '--target-org', + testOrg.username, + ]); + expect.fail('should have thrown'); + } catch (err: any) { + expect(err.exitCode).to.equal(2); + expect(err.name).to.equal('AgentNotFound'); + } + }); + + it('throws TestExecutionFailed (exit 4) when executeBatches fails', async () => { + executeBatchesStub.rejects(new Error('API down')); + + try { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + expect.fail('should have thrown'); + } catch (err: any) { + expect(err.exitCode).to.equal(4); + expect(err.name).to.equal('TestExecutionFailed'); + } + }); + }); + + // ─── YAML spec path ──────────────────────────────────────────────────────── + + describe('YAML spec', () => { + beforeEach(() => { + isYamlTestSpecStub.returns(true); + }); + + it('runs with an inline YAML string', async () => { + const result = await AgentTestRunEval.run(['--spec', YAML_SPEC, '--target-org', testOrg.username]); + + expect(parseTestSpecStub.calledOnce).to.be.true; + expect(translateTestSpecStub.calledOnce).to.be.true; + expect(result.summary.passed).to.equal(1); + }); + + it('auto-infers api-name from subjectName when --api-name is omitted', async () => { + await AgentTestRunEval.run(['--spec', YAML_SPEC, '--target-org', testOrg.username]); + + // resolveAgent should be called with the subjectName from the parsed spec + expect(resolveAgentStub.calledOnceWith(sinon.match.any, 'Local_Info_Agent')).to.be.true; + }); + + it('prefers explicit --api-name over auto-inferred subjectName', async () => { + await AgentTestRunEval.run([ + '--spec', + YAML_SPEC, + '--api-name', + 'Override_Agent', + '--target-org', + testOrg.username, + ]); + + expect(resolveAgentStub.calledOnceWith(sinon.match.any, 'Override_Agent')).to.be.true; + }); + + it('reads YAML spec from a file', async () => { + const specFile = join(tmpDir, 'spec.yaml'); + writeFileSync(specFile, YAML_SPEC, 'utf-8'); + + // isYamlTestSpec returns false for the file path string, true for the file content + 
isYamlTestSpecStub.onFirstCall().returns(false).onSecondCall().returns(true); + + const result = await AgentTestRunEval.run(['--spec', specFile, '--target-org', testOrg.username]); + + expect(result.summary.passed).to.equal(1); + }); + }); + + // ─── Batch size ──────────────────────────────────────────────────────────── + + describe('batch size', () => { + it('clamps --batch-size to maximum of 5', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--batch-size', '99', '--target-org', testOrg.username]); + + const batchSize = splitIntoBatchesStub.firstCall.args[1] as number; + expect(batchSize).to.equal(5); + }); + + it('clamps --batch-size to minimum of 1', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--batch-size', '0', '--target-org', testOrg.username]); + + const batchSize = splitIntoBatchesStub.firstCall.args[1] as number; + expect(batchSize).to.equal(1); + }); + + it('passes through a valid --batch-size unchanged', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--batch-size', '3', '--target-org', testOrg.username]); + + const batchSize = splitIntoBatchesStub.firstCall.args[1] as number; + expect(batchSize).to.equal(3); + }); + }); + + // ─── Result format ───────────────────────────────────────────────────────── + + describe('result format', () => { + it('defaults to human format', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(formatResultsStub.calledOnceWith(sinon.match.any, 'human')).to.be.true; + }); + + it('passes --result-format tap to formatResults', async () => { + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--result-format', 'tap', '--target-org', testOrg.username]); + + expect(formatResultsStub.calledOnceWith(sinon.match.any, 'tap')).to.be.true; + }); + + it('passes --result-format junit to formatResults', async () => { + await AgentTestRunEval.run([ + '--spec', + EVAL_PAYLOAD, + '--result-format', + 'junit', + '--target-org', + testOrg.username, + ]); + + expect(formatResultsStub.calledOnceWith(sinon.match.any, 'junit')).to.be.true; + }); + }); + + // ─── Exit code behaviour ─────────────────────────────────────────────────── + + describe('exit code', () => { + it('sets process.exitCode to 1 when summary contains errors', async () => { + buildResultSummaryStub.returns({ + summary: { passed: 0, failed: 0, scored: 0, errors: 2 }, + testSummaries: [{ id: 'test-1', status: 'failed', evaluations: [], outputs: [] }], + }); + + const originalExitCode = process.exitCode; + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(process.exitCode).to.equal(1); + process.exitCode = originalExitCode; + }); + + it('does not set process.exitCode when there are no errors', async () => { + process.exitCode = undefined; + await AgentTestRunEval.run(['--spec', EVAL_PAYLOAD, '--target-org', testOrg.username]); + + expect(process.exitCode).to.be.undefined; + }); + }); +}); diff --git a/test/evalFormatter.test.ts b/test/evalFormatter.test.ts index b4b85a58..822b93eb 100644 --- a/test/evalFormatter.test.ts +++ b/test/evalFormatter.test.ts @@ -17,7 +17,7 @@ /* eslint-disable camelcase */ import { expect } from 'chai'; -import { formatResults, type EvalApiResponse } from '../src/evalFormatter.js'; +import { formatResults, type EvalApiResponse } from '@salesforce/agents'; const MOCK_RESPONSE: EvalApiResponse = { results: [ diff --git a/test/evalNormalizer.test.ts b/test/evalNormalizer.test.ts index 3ddab444..c4857d36 
100644 --- a/test/evalNormalizer.test.ts +++ b/test/evalNormalizer.test.ts @@ -29,7 +29,7 @@ import { splitIntoBatches, type EvalStep, type EvalPayload, -} from '../src/evalNormalizer.js'; +} from '@salesforce/agents'; describe('evalNormalizer', () => { describe('normalizeMcpShorthand', () => { diff --git a/test/yamlSpecTranslator.test.ts b/test/yamlSpecTranslator.test.ts index 40914c67..1341fd77 100644 --- a/test/yamlSpecTranslator.test.ts +++ b/test/yamlSpecTranslator.test.ts @@ -18,7 +18,7 @@ import { expect } from 'chai'; import type { TestCase } from '@salesforce/agents'; -import { isYamlTestSpec, parseTestSpec, translateTestCase, translateTestSpec } from '../src/yamlSpecTranslator.js'; +import { isYamlTestSpec, parseTestSpec, translateTestCase, translateTestSpec } from '@salesforce/agents'; describe('yamlSpecTranslator', () => { describe('isYamlTestSpec', () => { From f48d2b0d064cdc2c778e97be66f4edf7cf78738d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Rivas?= Date: Tue, 5 May 2026 17:13:08 -0300 Subject: [PATCH 2/3] fix(agent/test/run-eval): address pre-PR review findings - Remove state='ga' (GA commands omit state to avoid oclif help regression) - Remove unused --wait flag - Wrap parseTestSpec/translateTestSpec in try/catch for named SfError - Validate test.steps is an array before injection loop - Hoist isYamlTestSpec to avoid double-call - Extract resolveAndInjectAgent helper to stay within complexity limit - Fix test stubs to use $$.SANDBOX, remove sinon.restore(), fix describe label Co-Authored-By: Claude Sonnet 4.6 --- messages/agent.test.run-eval.md | 4 - src/commands/agent/test/run-eval.ts | 119 +++++++++++----------- test/commands/agent/test/run-eval.test.ts | 31 +++--- 3 files changed, 74 insertions(+), 80 deletions(-) diff --git a/messages/agent.test.run-eval.md b/messages/agent.test.run-eval.md index 304ab4bd..2350c84b 100644 --- a/messages/agent.test.run-eval.md +++ b/messages/agent.test.run-eval.md @@ -20,10 +20,6 @@ Path to test spec file (YAML or JSON). Supports reading from stdin when piping c Agent DeveloperName (also called API name) to resolve agent_id and agent_version_id. Auto-inferred from the YAML spec's subjectName. -# flags.wait.summary - -Number of minutes to wait for results. - # flags.result-format.summary Format of the agent test results. 
diff --git a/src/commands/agent/test/run-eval.ts b/src/commands/agent/test/run-eval.ts index d0187566..c31b3815 100644 --- a/src/commands/agent/test/run-eval.ts +++ b/src/commands/agent/test/run-eval.ts @@ -41,11 +41,35 @@ export type RunEvalResult = { summary: { passed: number; failed: number; scored: number; errors: number }; }; +async function resolveAndInjectAgent( + org: Parameters<typeof resolveAgent>[0], + agentApiName: string, + payload: EvalPayload +): Promise<void> { + let agentId: string; + let versionId: string; + try { + ({ agentId, versionId } = await resolveAgent(org, agentApiName)); + } catch (e) { + const wrapped = SfError.wrap(e); + throw new SfError(`Agent '${agentApiName}' not found.`, 'AgentNotFound', [], 2, wrapped); + } + for (const test of payload.tests) { + for (const step of test.steps) { + if (step.type === 'agent.create_session') { + // eslint-disable-next-line camelcase + step.agent_id = agentId; + // eslint-disable-next-line camelcase + step.agent_version_id = versionId; + } + } + } +} + export default class AgentTestRunEval extends SfCommand { public static readonly summary = messages.getMessage('summary'); public static readonly description = messages.getMessage('description'); public static readonly examples = messages.getMessages('examples'); - public static state = 'ga'; public static readonly envVariablesSection = toHelpSection( 'ENVIRONMENT VARIABLES', @@ -72,11 +96,6 @@ export default class AgentTestRunEval extends SfCommand { char: 'n', summary: messages.getMessage('flags.api-name.summary'), }), - wait: Flags.integer({ - char: 'w', - default: 10, - summary: messages.getMessage('flags.wait.summary'), - }), 'result-format': resultFormatFlag(), 'batch-size': Flags.integer({ default: 5, @@ -94,32 +113,37 @@ export default class AgentTestRunEval extends SfCommand { // 1. Get spec content (from file or stdin via allowStdin) let rawContent = flags.spec; + let isYaml = isYamlTestSpec(rawContent); - // If spec looks like it might be a file path (not parseable content), read the file - try { - if (!isYamlTestSpec(rawContent)) { - JSON.parse(rawContent); - } - } catch { + if (!isYaml) { try { - rawContent = await readFile(flags.spec, 'utf-8'); - } catch (e) { - const wrapped = SfError.wrap(e); - throw new SfError(`Spec file not found: ${flags.spec}`, 'SpecFileNotFound', [], 2, wrapped); + JSON.parse(rawContent); + } catch { + try { + rawContent = await readFile(flags.spec, 'utf-8'); + } catch (e) { + const wrapped = SfError.wrap(e); + throw new SfError(`Spec file not found: ${flags.spec}`, 'SpecFileNotFound', [], 2, wrapped); + } + isYaml = isYamlTestSpec(rawContent); + } } // 2.
Detect format and parse - let payload: EvalPayload; + let payload!: EvalPayload; let agentApiName = flags['api-name']; - if (isYamlTestSpec(rawContent)) { - const spec = parseTestSpec(rawContent); - payload = translateTestSpec(spec); + if (isYaml) { + try { + const spec = parseTestSpec(rawContent); + payload = translateTestSpec(spec); - if (!agentApiName) { - agentApiName = spec.subjectName; - this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()])); + if (!agentApiName) { + agentApiName = spec.subjectName; + this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()])); + } + } catch (e) { + throw messages.createError('error.invalidPayload', [(e as Error).message]); } } else { try { @@ -133,29 +157,15 @@ export default class AgentTestRunEval extends SfCommand { throw messages.createError('error.invalidPayload', ['missing or empty "tests" array']); } - // 3. If --api-name (or auto-inferred from YAML), resolve IDs and inject - if (agentApiName) { - let agentId; - let versionId; - try { - const resolved = await resolveAgent(org, agentApiName); - agentId = resolved.agentId; - versionId = resolved.versionId; - } catch (e) { - const wrapped = SfError.wrap(e); - throw new SfError(`Agent '${agentApiName}' not found.`, 'AgentNotFound', [], 2, wrapped); + for (const test of payload.tests) { + if (!Array.isArray(test.steps)) { + throw messages.createError('error.invalidPayload', [`test '${test.id}' has missing or invalid 'steps' array`]); } + } - for (const test of payload.tests) { - for (const step of test.steps) { - if (step.type === 'agent.create_session') { - // eslint-disable-next-line camelcase - step.agent_id = agentId; - // eslint-disable-next-line camelcase - step.agent_version_id = versionId; - } - } - } + // 3. If --api-name (or auto-inferred from YAML), resolve IDs and inject + if (agentApiName) { + await resolveAndInjectAgent(org, agentApiName, payload); } // 4. Normalize payload unless --no-normalize @@ -163,17 +173,12 @@ export default class AgentTestRunEval extends SfCommand { payload = normalizePayload(payload); } - // 5. Clamp batch size + // 5. Clamp batch size and split into batches const batchSize = Math.min(Math.max(flags['batch-size'], 1), 5); - - // 6. Split into batches const batches = splitIntoBatches(payload.tests, batchSize); - // 7. Execute batches - let allResults; - try { - allResults = await executeBatches(org, batches, (msg) => this.log(msg)); - } catch (e) { + // 6. Execute batches + const allResults = await executeBatches(org, batches, (msg) => this.log(msg)).catch((e) => { const wrapped = SfError.wrap(e); throw new SfError( `Failed to execute tests: ${wrapped.message}`, @@ -182,16 +187,14 @@ export default class AgentTestRunEval extends SfCommand { 4, wrapped ); - } + }); const mergedResponse: EvalApiResponse = { results: allResults as EvalApiResponse['results'] }; - // 8. Format output - const resultFormat = (flags['result-format'] ?? 'human') as ResultFormat; - const formatted = formatResults(mergedResponse, resultFormat); - this.log(formatted); + // 7. Format output + this.log(formatResults(mergedResponse, (flags['result-format'] ?? 'human') as ResultFormat)); - // 9. Build structured result for --json + // 8. 
Build structured result for --json const { summary, testSummaries } = buildResultSummary(mergedResponse); if (summary.errors > 0) { diff --git a/test/commands/agent/test/run-eval.test.ts b/test/commands/agent/test/run-eval.test.ts index cccef486..8be229b9 100644 --- a/test/commands/agent/test/run-eval.test.ts +++ b/test/commands/agent/test/run-eval.test.ts @@ -14,8 +14,7 @@ * limitations under the License. */ -/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, - @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, camelcase */ +/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, camelcase */ import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; import { tmpdir } from 'node:os'; @@ -75,7 +74,7 @@ const MOCK_API_RESULTS = [ // ─── Test suite ────────────────────────────────────────────────────────────── -describe('agent test run-eval command', () => { +describe('agent test run-eval', () => { const $$ = new TestContext(); let testOrg: MockTestOrgData; let tmpDir: string; @@ -100,22 +99,22 @@ describe('agent test run-eval command', () => { tmpDir = mkdtempSync(join(tmpdir(), 'run-eval-test-')); // Default stub implementations - isYamlTestSpecStub = sinon.stub().returns(false); - parseTestSpecStub = sinon.stub().returns({ + isYamlTestSpecStub = $$.SANDBOX.stub().returns(false); + parseTestSpecStub = $$.SANDBOX.stub().returns({ name: 'Weather_Test', subjectName: 'Local_Info_Agent', testCases: [{ utterance: 'What is the weather?' }], }); - translateTestSpecStub = sinon.stub().returns(JSON.parse(EVAL_PAYLOAD)); - normalizePayloadStub = sinon.stub().callsFake((p: unknown) => p); - splitIntoBatchesStub = sinon.stub().callsFake((tests: unknown[]) => [tests]); - resolveAgentStub = sinon.stub().resolves({ agentId: 'bot-001', versionId: 'ver-001' }); - executeBatchesStub = sinon.stub().resolves(MOCK_API_RESULTS); - buildResultSummaryStub = sinon.stub().returns({ + translateTestSpecStub = $$.SANDBOX.stub().returns(JSON.parse(EVAL_PAYLOAD)); + normalizePayloadStub = $$.SANDBOX.stub().callsFake((p: unknown) => p); + splitIntoBatchesStub = $$.SANDBOX.stub().callsFake((tests: unknown[]) => [tests]); + resolveAgentStub = $$.SANDBOX.stub().resolves({ agentId: 'bot-001', versionId: 'ver-001' }); + executeBatchesStub = $$.SANDBOX.stub().resolves(MOCK_API_RESULTS); + buildResultSummaryStub = $$.SANDBOX.stub().returns({ summary: { passed: 1, failed: 0, scored: 0, errors: 0 }, testSummaries: [{ id: 'test-topic-routing', status: 'passed', evaluations: [], outputs: [] }], }); - formatResultsStub = sinon.stub().returns('# Agent Evaluation Results'); + formatResultsStub = $$.SANDBOX.stub().returns('# Agent Evaluation Results'); const mod = await esmock('../../../../src/commands/agent/test/run-eval.js', { '@salesforce/agents': { @@ -135,7 +134,6 @@ describe('agent test run-eval command', () => { }); afterEach(() => { - sinon.restore(); $$.restore(); rmSync(tmpDir, { recursive: true, force: true }); }); @@ -143,11 +141,8 @@ describe('agent test run-eval command', () => { // ─── State ───────────────────────────────────────────────────────────────── describe('command metadata', () => { - it('is marked as GA state', () => { - expect(AgentTestRunEval.state).to.equal('ga'); - }); - - it('is not hidden', () => { + it('is not in beta or hidden state', () => { + expect(AgentTestRunEval.state).to.not.equal('beta'); 
       expect(AgentTestRunEval.hidden).to.not.equal(true);
     });
   });

From 7f9c3533bbb497ff1847c048a47fb8641f0b2519 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9s=20Rivas?=
Date: Tue, 5 May 2026 17:14:08 -0300
Subject: [PATCH 3/3] fix(agent/test/run-eval): update command snapshot to reflect --wait removal

Co-Authored-By: Claude Sonnet 4.6
---
 command-snapshot.json | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/command-snapshot.json b/command-snapshot.json
index f48911bf..9f84671b 100644
--- a/command-snapshot.json
+++ b/command-snapshot.json
@@ -228,7 +228,7 @@
     "alias": [],
     "command": "agent:test:run-eval",
     "flagAliases": [],
-    "flagChars": ["n", "o", "s", "w"],
+    "flagChars": ["n", "o", "s"],
     "flags": [
       "api-name",
       "api-version",
@@ -238,8 +238,7 @@
       "no-normalize",
       "result-format",
       "spec",
-      "target-org",
-      "wait"
+      "target-org"
     ],
     "plugin": "@salesforce/plugin-agent"
   },
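
Note (not part of the patch series): for reviewers who want the end-to-end flow without re-reading the hunks, below is a condensed TypeScript sketch of how a caller can drive the same @salesforce/agents surface the command now delegates to. The runEvalSketch helper, the Org wiring, and the exact export typings are assumptions made for illustration; only the function names and call shapes come from the diffs above.

import { Org } from '@salesforce/core';
import {
  type EvalPayload,
  type EvalApiResponse,
  type ResultFormat,
  normalizePayload,
  splitIntoBatches,
  resolveAgent,
  executeBatches,
  buildResultSummary,
  formatResults,
} from '@salesforce/agents';

// Illustrative sketch only; mirrors the command's run() flow under the assumptions above.
export async function runEvalSketch(org: Org, payload: EvalPayload, apiName?: string): Promise<void> {
  // Resolve the agent and stamp its IDs onto every agent.create_session step,
  // the same injection the command performs in resolveAndInjectAgent.
  if (apiName) {
    const { agentId, versionId } = await resolveAgent(org, apiName);
    for (const test of payload.tests) {
      for (const step of test.steps) {
        if (step.type === 'agent.create_session') {
          step.agent_id = agentId;
          step.agent_version_id = versionId;
        }
      }
    }
  }

  // Normalize, split into batches of at most 5 tests, and execute them.
  const normalized = normalizePayload(payload);
  const batches = splitIntoBatches(normalized.tests, 5);
  const results = await executeBatches(org, batches, (msg) => console.log(msg));

  // Merge the batch results, summarize, and print in the default human format.
  const merged: EvalApiResponse = { results: results as EvalApiResponse['results'] };
  const { summary } = buildResultSummary(merged);
  console.log(formatResults(merged, 'human' as ResultFormat));
  console.log(`passed=${summary.passed} failed=${summary.failed} errors=${summary.errors}`);
}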