diff --git a/messages/agent.generate.test-spec.md b/messages/agent.generate.test-spec.md index abbcc96c..effb107c 100644 --- a/messages/agent.generate.test-spec.md +++ b/messages/agent.generate.test-spec.md @@ -15,6 +15,8 @@ To generate a specific agent test case, this command prompts you for this inform - (Optional) Custom evaluation: Test an agent's response for specific strings or numbers. - (Optional) Conversation history: Boilerplate for additional context you can add to the test in the form of a conversation history. +You can manually add contextVariables to test cases in the generated YAML file to inject contextual data (such as CaseId or RoutableId) into agent sessions. This is useful for testing agent behavior with different contextual information. + When your test spec is ready, you then run the "agent test create" command to actually create the test in your org and synchronize the metadata with your DX project. The metadata type for an agent test is AiEvaluationDefinition. If you have an existing AiEvaluationDefinition metadata XML file in your DX project, you can generate its equivalent YAML test spec file with the --from-definition flag. diff --git a/messages/agent.test.run-eval.md b/messages/agent.test.run-eval.md index 356e2f98..304ab4bd 100644 --- a/messages/agent.test.run-eval.md +++ b/messages/agent.test.run-eval.md @@ -6,15 +6,15 @@ Run evaluation tests against an Agentforce agent. Execute rich evaluation tests against an Agentforce agent using the Einstein Evaluation API. Supports both YAML test specs (same format as `sf agent generate test-spec`) and JSON payloads. -When you provide a YAML test spec, the command automatically translates test cases into Evaluation API calls and infers the agent name from the spec's `subjectName` field. This means you can use the same test spec with both `sf agent test run` and `sf agent test run-eval`. 
+When you provide a YAML test spec, the command automatically translates test cases into Evaluation API calls and infers the agent name from the spec's `subjectName` field. This means you can use the same test spec with both `sf agent test run` and `sf agent test run-eval`. YAML test specs also support contextVariables, which allow you to inject contextual data (such as CaseId or RoutableId) into agent sessions for testing with different contexts. -When you provide a JSON payload, it's sent directly to the API with optional normalization. The normalizer auto-corrects common field name mistakes, converts shorthand references to JSONPath, and injects defaults. Use `--no-normalize` to disable this auto-normalization. +When you provide a JSON payload, it's sent directly to the API with optional normalization. The normalizer auto-corrects common field name mistakes, converts shorthand references to JSONPath, and injects defaults. Use `--no-normalize` to disable this auto-normalization. JSON payloads can also include context_variables on agent.create_session steps for the same contextual testing capabilities. Supports 8+ evaluator types, including topic routing assertions, action invocation checks, string/numeric assertions, semantic similarity scoring, and LLM-based quality ratings. # flags.spec.summary -Path to test spec file (YAML or JSON). Use `-` for stdin. +Path to test spec file (YAML or JSON). Supports reading from stdin when piping content. # flags.api-name.summary @@ -54,9 +54,13 @@ Disable auto-normalization of field names and shorthand references. 
<%= config.bin %> <%= command.id %> --spec tests/my-agent-testSpec.yaml --target-org my-org --result-format junit -- Pipe JSON payload from stdin: +- Run tests with contextVariables to inject contextual data into agent sessions (add contextVariables to test cases in your YAML spec): - $ echo '{"tests":[...]}' | <%= config.bin %> <%= command.id %> --spec - --target-org my-org + <%= config.bin %> <%= command.id %> --spec tests/agent-with-context.yaml --target-org my-org + +- Pipe JSON payload from stdin (--spec flag is automatically populated from stdin): + + $ echo '{"tests":[...]}' | <%= config.bin %> <%= command.id %> --spec --target-org my-org # info.batchProgress diff --git a/schemas/agent-test-run__eval.json b/schemas/agent-test-run__eval.json index 351919a2..f4094406 100644 --- a/schemas/agent-test-run__eval.json +++ b/schemas/agent-test-run__eval.json @@ -19,9 +19,13 @@ "evaluations": { "type": "array", "items": {} + }, + "outputs": { + "type": "array", + "items": {} } }, - "required": ["id", "status", "evaluations"], + "required": ["id", "status", "evaluations", "outputs"], "additionalProperties": false } }, diff --git a/src/commands/agent/test/run-eval.ts b/src/commands/agent/test/run-eval.ts index 0c176f47..f364c434 100644 --- a/src/commands/agent/test/run-eval.ts +++ b/src/commands/agent/test/run-eval.ts @@ -26,7 +26,7 @@ Messages.importMessagesDirectoryFromMetaUrl(import.meta.url); const messages = Messages.loadMessages('@salesforce/plugin-agent', 'agent.test.run-eval'); export type RunEvalResult = { - tests: Array<{ id: string; status: string; evaluations: unknown[] }>; + tests: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }>; summary: { passed: number; failed: number; scored: number; errors: number }; }; @@ -122,7 +122,7 @@ function buildResultSummary(mergedResponse: EvalApiResponse): { testSummaries: RunEvalResult['tests']; } { const summary = { passed: 0, failed: 0, scored: 0, errors: 0 }; - const testSummaries: 
Array<{ id: string; status: string; evaluations: unknown[] }> = []; + const testSummaries: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }> = []; for (const testResult of mergedResponse.results ?? []) { const tr = testResult as Record<string, unknown>; @@ -143,6 +143,7 @@ function buildResultSummary(mergedResponse: EvalApiResponse): { id: testId, status: failed > 0 || testErrors.length > 0 ? 'failed' : 'passed', evaluations: evalResults, + outputs: (tr.outputs as unknown[]) ?? [], }); } diff --git a/src/evalNormalizer.ts b/src/evalNormalizer.ts index 7d2a1921..f9901cd1 100644 --- a/src/evalNormalizer.ts +++ b/src/evalNormalizer.ts @@ -73,7 +73,17 @@ const ASSERTION_VALID_FIELDS = new Set([ ]); const VALID_AGENT_FIELDS: Record<string, Set<string>> = { - 'agent.create_session': new Set(['type', 'id', 'agent_id', 'agent_version_id', 'use_agent_api', 'planner_id']), + 'agent.create_session': new Set([ + 'type', + 'id', + 'agent_id', + 'agent_version_id', + 'use_agent_api', + 'planner_id', + 'state', + 'setupSessionContext', + 'context_variables', + ]), 'agent.send_message': new Set(['type', 'id', 'session_id', 'utterance']), 'agent.get_state': new Set(['type', 'id', 'session_id']), }; diff --git a/src/yamlSpecTranslator.ts b/src/yamlSpecTranslator.ts index 6c3fb862..b19015f2 100644 --- a/src/yamlSpecTranslator.ts +++ b/src/yamlSpecTranslator.ts @@ -101,11 +101,30 @@ export function translateTestCase(testCase: TestCase, index: number, specName?: const steps: EvalStep[] = []; // 1.
agent.create_session - steps.push({ + const createSessionStep: EvalStep = { type: 'agent.create_session', id: 'cs', use_agent_api: true, - }); + }; + + if (testCase.contextVariables && testCase.contextVariables.length > 0) { + // Validate for duplicate names + const names = testCase.contextVariables.map((cv) => cv.name); + const duplicates = names.filter((name, idx) => names.indexOf(name) !== idx); + if (duplicates.length > 0) { + throw new Error( + `Duplicate contextVariable names found in test case ${index}: ${[...new Set(duplicates)].join( + ', ' + )}. Each contextVariable name must be unique.` + ); + } + + createSessionStep.context_variables = Object.fromEntries( + testCase.contextVariables.map((cv) => [cv.name, cv.value]) + ); + } + + steps.push(createSessionStep); // 2. Conversation history — only user messages become send_message steps let historyIdx = 0; diff --git a/test/evalNormalizer.test.ts b/test/evalNormalizer.test.ts index 7c15e352..3ddab444 100644 --- a/test/evalNormalizer.test.ts +++ b/test/evalNormalizer.test.ts @@ -337,6 +337,60 @@ describe('evalNormalizer', () => { expect(result[0]).to.have.property('generated_output', 'test'); }); + it('should preserve state field on agent.create_session', () => { + const steps: EvalStep[] = [ + { + type: 'agent.create_session', + id: 's1', + planner_id: 'p1', + state: { + state: { + plannerType: 'Atlas', + sessionContext: {}, + conversationHistory: [], + lastExecution: {}, + }, + }, + }, + ]; + const result = stripUnrecognizedFields(steps); + expect(result[0]).to.have.property('state'); + expect((result[0] as Record<string, unknown>).state).to.deep.equal(steps[0].state); + }); + + it('should preserve setupSessionContext on agent.create_session', () => { + const steps: EvalStep[] = [ + { + type: 'agent.create_session', + id: 's1', + planner_id: 'p1', + setupSessionContext: { tags: { botId: '0Xx123', botVersionId: '0X9456' } }, + }, + ]; + const result = stripUnrecognizedFields(steps); +
expect(result[0]).to.have.property('setupSessionContext'); + expect((result[0] as Record<string, unknown>).setupSessionContext).to.deep.equal({ + tags: { botId: '0Xx123', botVersionId: '0X9456' }, + }); + }); + + it('should preserve context_variables on agent.create_session', () => { + const steps: EvalStep[] = [ + { + type: 'agent.create_session', + id: 's1', + use_agent_api: true, + context_variables: { RoutableId: '0Mw123', CaseId: '500456' }, + }, + ]; + const result = stripUnrecognizedFields(steps); + expect(result[0]).to.have.property('context_variables'); + expect((result[0] as Record<string, unknown>).context_variables).to.deep.equal({ + RoutableId: '0Mw123', + CaseId: '500456', + }); + }); + it('should not strip fields from unknown types', () => { const steps: EvalStep[] = [{ type: 'evaluator.future_type', id: 'e1', custom_field: 'keep' }]; const result = stripUnrecognizedFields(steps); diff --git a/test/mock-projects/agent-generate-template/specs/eval-with-context.yaml b/test/mock-projects/agent-generate-template/specs/eval-with-context.yaml new file mode 100644 index 00000000..c2d28d42 --- /dev/null +++ b/test/mock-projects/agent-generate-template/specs/eval-with-context.yaml @@ -0,0 +1,21 @@ +name: Agent_Context_Test +description: Test agent with contextVariables +subjectType: AGENT +subjectName: Local_Info_Agent +testCases: + - utterance: 'What is the weather?'
+ expectedTopic: Weather_and_Temperature_Information + expectedActions: [] + expectedOutcome: 'The agent should provide weather information' + contextVariables: + - name: CaseId + value: '500ABC123' + - name: RoutableId + value: '0MwXYZ456' + - utterance: 'Tell me about the temperature' + expectedTopic: Weather_and_Temperature_Information + expectedActions: [] + expectedOutcome: 'The agent should provide temperature information' + contextVariables: + - name: UserId + value: '005DEF789' diff --git a/test/nuts/agent.test.run-eval.nut.ts b/test/nuts/agent.test.run-eval.nut.ts index 1af21425..e21d4b30 100644 --- a/test/nuts/agent.test.run-eval.nut.ts +++ b/test/nuts/agent.test.run-eval.nut.ts @@ -29,6 +29,7 @@ describe('agent test run-eval', function () { const mockProjectDir = join(process.cwd(), 'test', 'mock-projects', 'agent-generate-template', 'specs'); const jsonPayloadPath = join(mockProjectDir, 'eval-payload.json'); const yamlSpecPath = join(mockProjectDir, 'eval-test-spec.yaml'); + const yamlWithContextPath = join(mockProjectDir, 'eval-with-context.yaml'); before(async function () { this.timeout(30 * 60 * 1000); // 30 minutes for setup @@ -84,6 +85,18 @@ describe('agent test run-eval', function () { expect(output?.result).to.be.ok; expect(output?.result.tests).to.be.an('array'); }); + + it('should handle YAML spec with contextVariables', async () => { + const command = `agent test run-eval --spec ${yamlWithContextPath} --target-org ${getUsername()} --json`; + // Don't enforce exit code 0 since the command exits with 1 if tests fail + const output = execCmd(command).jsonOutput; + + // Verify the command succeeds with contextVariables + expect(output?.result).to.be.ok; + expect(output?.result.tests).to.be.an('array'); + expect(output?.result.tests.length).to.be.greaterThan(0); + expect(output?.result.summary).to.be.ok; + }); }); describe('run-eval with flags', () => { @@ -179,6 +192,8 @@ describe('agent test run-eval', function () { 
expect(firstTest).to.have.property('status'); expect(firstTest).to.have.property('evaluations'); expect(firstTest?.evaluations).to.be.an('array'); + expect(firstTest).to.have.property('outputs'); + expect(firstTest?.outputs).to.be.an('array'); }); it('should include summary with all metrics', async () => { diff --git a/test/yamlSpecTranslator.test.ts b/test/yamlSpecTranslator.test.ts index 8b2776f0..40914c67 100644 --- a/test/yamlSpecTranslator.test.ts +++ b/test/yamlSpecTranslator.test.ts @@ -622,6 +622,90 @@ testCases: [] expect(result.id).to.equal('My_Spec_case_2'); }); + it('injects context_variables when contextVariables present', () => { + const tc: TestCase = { + utterance: 'Help with my camera', + expectedTopic: 'Product_Help', + expectedActions: undefined, + expectedOutcome: undefined, + contextVariables: [ + { name: 'RoutableId', value: '0Mw123' }, + { name: 'CaseId', value: '500456' }, + ], + }; + const result = translateTestCase(tc, 0); + const cs = result.steps.find((s) => s.type === 'agent.create_session'); + expect(cs).to.have.property('context_variables'); + expect((cs as Record<string, unknown>).context_variables).to.deep.equal({ + RoutableId: '0Mw123', + CaseId: '500456', + }); + }); + + it('does not add context_variables when contextVariables absent', () => { + const tc: TestCase = { + utterance: 'Hello', + expectedTopic: undefined, + expectedActions: undefined, + expectedOutcome: undefined, + }; + const result = translateTestCase(tc, 0); + const cs = result.steps.find((s) => s.type === 'agent.create_session'); + expect(cs).to.not.have.property('context_variables'); + }); + + it('does not add context_variables when contextVariables is empty', () => { + const tc: TestCase = { + utterance: 'Hello', + expectedTopic: undefined, + expectedActions: undefined, + expectedOutcome: undefined, + contextVariables: [], + }; + const result = translateTestCase(tc, 0); + const cs = result.steps.find((s) => s.type === 'agent.create_session'); +
expect(cs).to.not.have.property('context_variables'); + }); + + it('throws error when contextVariables has duplicate names', () => { + const tc: TestCase = { + utterance: 'Hello', + expectedTopic: undefined, + expectedActions: undefined, + expectedOutcome: undefined, + contextVariables: [ + { name: 'CaseId', value: '500123' }, + { name: 'RoutableId', value: '0Mw456' }, + { name: 'CaseId', value: '500789' }, + ], + }; + expect(() => translateTestCase(tc, 0)).to.throw(/Duplicate contextVariable names found in test case 0: CaseId/); + }); + + it('translates contextVariables to context_variables object format', () => { + const tc: TestCase = { + utterance: 'Test with context', + expectedTopic: 'Test_Topic', + expectedActions: undefined, + expectedOutcome: undefined, + contextVariables: [ + { name: 'CaseId', value: '500ABC' }, + { name: 'RoutableId', value: '0MwXYZ' }, + { name: 'UserId', value: '005DEF' }, + ], + }; + const result = translateTestCase(tc, 0); + const cs = result.steps.find((s) => s.type === 'agent.create_session') as Record<string, unknown>; + + expect(cs).to.have.property('context_variables'); + const contextVars = cs.context_variables as Record<string, unknown>; + expect(contextVars).to.deep.equal({ + CaseId: '500ABC', + RoutableId: '0MwXYZ', + UserId: '005DEF', + }); + }); + it('sets use_agent_api true on create_session', () => { const tc: TestCase = { utterance: 'Hello',