Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions messages/agent.generate.test-spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ To generate a specific agent test case, this command prompts you for this inform
- (Optional) Custom evaluation: Test an agent's response for specific strings or numbers.
- (Optional) Conversation history: Boilerplate for additional context you can add to the test in the form of a conversation history.

You can manually add contextVariables to test cases in the generated YAML file to inject contextual data (such as CaseId or RoutableId) into agent sessions. This is useful for testing agent behavior with different contextual information.

When your test spec is ready, you then run the "agent test create" command to actually create the test in your org and synchronize the metadata with your DX project. The metadata type for an agent test is AiEvaluationDefinition.

If you have an existing AiEvaluationDefinition metadata XML file in your DX project, you can generate its equivalent YAML test spec file with the --from-definition flag.
Expand Down
14 changes: 9 additions & 5 deletions messages/agent.test.run-eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ Run evaluation tests against an Agentforce agent.

Execute rich evaluation tests against an Agentforce agent using the Einstein Evaluation API. Supports both YAML test specs (same format as `sf agent generate test-spec`) and JSON payloads.

When you provide a YAML test spec, the command automatically translates test cases into Evaluation API calls and infers the agent name from the spec's `subjectName` field. This means you can use the same test spec with both `sf agent test run` and `sf agent test run-eval`.
When you provide a YAML test spec, the command automatically translates test cases into Evaluation API calls and infers the agent name from the spec's `subjectName` field. This means you can use the same test spec with both `sf agent test run` and `sf agent test run-eval`. YAML test specs also support `contextVariables`, which let you inject contextual data (such as `CaseId` or `RoutableId`) into agent sessions so that you can test the agent under different contexts.

When you provide a JSON payload, it's sent directly to the API with optional normalization. The normalizer auto-corrects common field name mistakes, converts shorthand references to JSONPath, and injects defaults. Use `--no-normalize` to disable this auto-normalization.
When you provide a JSON payload, it's sent directly to the API with optional normalization. The normalizer auto-corrects common field name mistakes, converts shorthand references to JSONPath, and injects defaults. Use `--no-normalize` to disable this auto-normalization. JSON payloads can also include `context_variables` on `agent.create_session` steps to inject the same kind of contextual data.

Supports 8+ evaluator types, including topic routing assertions, action invocation checks, string/numeric assertions, semantic similarity scoring, and LLM-based quality ratings.

# flags.spec.summary

Path to test spec file (YAML or JSON). Use `-` for stdin.
Path to test spec file (YAML or JSON). Supports reading from stdin when piping content.

# flags.api-name.summary

Expand Down Expand Up @@ -54,9 +54,13 @@ Disable auto-normalization of field names and shorthand references.

<%= config.bin %> <%= command.id %> --spec tests/my-agent-testSpec.yaml --target-org my-org --result-format junit

- Pipe JSON payload from stdin:
- Run tests with contextVariables to inject contextual data into agent sessions (add contextVariables to test cases in your YAML spec):

$ echo '{"tests":[...]}' | <%= config.bin %> <%= command.id %> --spec - --target-org my-org
<%= config.bin %> <%= command.id %> --spec tests/agent-with-context.yaml --target-org my-org

- Pipe JSON payload from stdin (--spec flag is automatically populated from stdin):

$ echo '{"tests":[...]}' | <%= config.bin %> <%= command.id %> --spec --target-org my-org

# info.batchProgress

Expand Down
6 changes: 5 additions & 1 deletion schemas/agent-test-run__eval.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@
"evaluations": {
"type": "array",
"items": {}
},
"outputs": {
"type": "array",
"items": {}
}
},
"required": ["id", "status", "evaluations"],
"required": ["id", "status", "evaluations", "outputs"],
"additionalProperties": false
}
},
Expand Down
5 changes: 3 additions & 2 deletions src/commands/agent/test/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Messages.importMessagesDirectoryFromMetaUrl(import.meta.url);
const messages = Messages.loadMessages('@salesforce/plugin-agent', 'agent.test.run-eval');

export type RunEvalResult = {
tests: Array<{ id: string; status: string; evaluations: unknown[] }>;
tests: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }>;
summary: { passed: number; failed: number; scored: number; errors: number };
};

Expand Down Expand Up @@ -122,7 +122,7 @@ function buildResultSummary(mergedResponse: EvalApiResponse): {
testSummaries: RunEvalResult['tests'];
} {
const summary = { passed: 0, failed: 0, scored: 0, errors: 0 };
const testSummaries: Array<{ id: string; status: string; evaluations: unknown[] }> = [];
const testSummaries: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }> = [];

for (const testResult of mergedResponse.results ?? []) {
const tr = testResult as Record<string, unknown>;
Expand All @@ -143,6 +143,7 @@ function buildResultSummary(mergedResponse: EvalApiResponse): {
id: testId,
status: failed > 0 || testErrors.length > 0 ? 'failed' : 'passed',
evaluations: evalResults,
outputs: (tr.outputs as unknown[]) ?? [],
});
}

Expand Down
12 changes: 11 additions & 1 deletion src/evalNormalizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,17 @@ const ASSERTION_VALID_FIELDS = new Set([
]);

const VALID_AGENT_FIELDS: Record<string, Set<string>> = {
'agent.create_session': new Set(['type', 'id', 'agent_id', 'agent_version_id', 'use_agent_api', 'planner_id']),
'agent.create_session': new Set([
'type',
'id',
'agent_id',
'agent_version_id',
'use_agent_api',
'planner_id',
'state',
'setupSessionContext',
'context_variables',
]),
'agent.send_message': new Set(['type', 'id', 'session_id', 'utterance']),
'agent.get_state': new Set(['type', 'id', 'session_id']),
};
Expand Down
23 changes: 21 additions & 2 deletions src/yamlSpecTranslator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,30 @@ export function translateTestCase(testCase: TestCase, index: number, specName?:
const steps: EvalStep[] = [];

// 1. agent.create_session
steps.push({
const createSessionStep: EvalStep = {
type: 'agent.create_session',
id: 'cs',
use_agent_api: true,
});
};

if (testCase.contextVariables && testCase.contextVariables.length > 0) {
// Validate for duplicate names
const names = testCase.contextVariables.map((cv) => cv.name);
const duplicates = names.filter((name, idx) => names.indexOf(name) !== idx);
if (duplicates.length > 0) {
throw new Error(
`Duplicate contextVariable names found in test case ${index}: ${[...new Set(duplicates)].join(
', '
)}. Each contextVariable name must be unique.`
);
}

createSessionStep.context_variables = Object.fromEntries(
testCase.contextVariables.map((cv) => [cv.name, cv.value])
);
}

steps.push(createSessionStep);

// 2. Conversation history — only user messages become send_message steps
let historyIdx = 0;
Expand Down
54 changes: 54 additions & 0 deletions test/evalNormalizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,60 @@ describe('evalNormalizer', () => {
expect(result[0]).to.have.property('generated_output', 'test');
});

// Checks that stripUnrecognizedFields leaves the nested `state` value on an
// agent.create_session step in place.
it('should preserve state field on agent.create_session', () => {
  const input: EvalStep[] = [
    {
      type: 'agent.create_session',
      id: 's1',
      planner_id: 'p1',
      state: {
        state: {
          plannerType: 'Atlas',
          sessionContext: {},
          conversationHistory: [],
          lastExecution: {},
        },
      },
    },
  ];

  const stripped = stripUnrecognizedFields(input)[0];

  expect(stripped).to.have.property('state');
  // The surviving value is compared against the one supplied on the input step.
  expect((stripped as Record<string, unknown>).state).to.deep.equal(input[0].state);
});

// Checks that stripUnrecognizedFields keeps `setupSessionContext` on an
// agent.create_session step.
it('should preserve setupSessionContext on agent.create_session', () => {
  // Expected value is a separate literal, so the comparison does not share a reference with the input.
  const expectedContext = {
    tags: { botId: '0Xx123', botVersionId: '0X9456' },
  };
  const input: EvalStep[] = [
    {
      type: 'agent.create_session',
      id: 's1',
      planner_id: 'p1',
      setupSessionContext: { tags: { botId: '0Xx123', botVersionId: '0X9456' } },
    },
  ];

  const stripped = stripUnrecognizedFields(input)[0];

  expect(stripped).to.have.property('setupSessionContext');
  expect((stripped as Record<string, unknown>).setupSessionContext).to.deep.equal(expectedContext);
});

// Checks that stripUnrecognizedFields keeps `context_variables` on an
// agent.create_session step.
it('should preserve context_variables on agent.create_session', () => {
  // Expected value is a separate literal, so the comparison does not share a reference with the input.
  const expectedVariables = {
    RoutableId: '0Mw123',
    CaseId: '500456',
  };
  const input: EvalStep[] = [
    {
      type: 'agent.create_session',
      id: 's1',
      use_agent_api: true,
      context_variables: { RoutableId: '0Mw123', CaseId: '500456' },
    },
  ];

  const stripped = stripUnrecognizedFields(input)[0];

  expect(stripped).to.have.property('context_variables');
  expect((stripped as Record<string, unknown>).context_variables).to.deep.equal(expectedVariables);
});

it('should not strip fields from unknown types', () => {
const steps: EvalStep[] = [{ type: 'evaluator.future_type', id: 'e1', custom_field: 'keep' }];
const result = stripUnrecognizedFields(steps);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: Agent_Context_Test
description: Test agent with contextVariables
subjectType: AGENT
subjectName: Local_Info_Agent
testCases:
- utterance: 'What is the weather?'
expectedTopic: Weather_and_Temperature_Information
expectedActions: []
expectedOutcome: 'The agent should provide weather information'
contextVariables:
- name: CaseId
value: '500ABC123'
- name: RoutableId
value: '0MwXYZ456'
- utterance: 'Tell me about the temperature'
expectedTopic: Weather_and_Temperature_Information
expectedActions: []
expectedOutcome: 'The agent should provide temperature information'
contextVariables:
- name: UserId
value: '005DEF789'
15 changes: 15 additions & 0 deletions test/nuts/agent.test.run-eval.nut.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ describe('agent test run-eval', function () {
const mockProjectDir = join(process.cwd(), 'test', 'mock-projects', 'agent-generate-template', 'specs');
const jsonPayloadPath = join(mockProjectDir, 'eval-payload.json');
const yamlSpecPath = join(mockProjectDir, 'eval-test-spec.yaml');
const yamlWithContextPath = join(mockProjectDir, 'eval-with-context.yaml');

before(async function () {
this.timeout(30 * 60 * 1000); // 30 minutes for setup
Expand Down Expand Up @@ -84,6 +85,18 @@ describe('agent test run-eval', function () {
expect(output?.result).to.be.ok;
expect(output?.result.tests).to.be.an('array');
});

// Runs `agent test run-eval` against the contextVariables YAML fixture and checks
// the shape of the JSON result.
it('should handle YAML spec with contextVariables', async () => {
  // Exit code 0 is intentionally not enforced: the command exits with 1 if tests fail.
  const { jsonOutput } = execCmd<RunEvalResult>(
    `agent test run-eval --spec ${yamlWithContextPath} --target-org ${getUsername()} --json`
  );
  const result = jsonOutput?.result;

  // The run must produce a result with at least one test entry and a summary.
  expect(result).to.be.ok;
  expect(result?.tests).to.be.an('array');
  expect(result?.tests.length).to.be.greaterThan(0);
  expect(result?.summary).to.be.ok;
});
});

describe('run-eval with flags', () => {
Expand Down Expand Up @@ -179,6 +192,8 @@ describe('agent test run-eval', function () {
expect(firstTest).to.have.property('status');
expect(firstTest).to.have.property('evaluations');
expect(firstTest?.evaluations).to.be.an('array');
expect(firstTest).to.have.property('outputs');
expect(firstTest?.outputs).to.be.an('array');
});

it('should include summary with all metrics', async () => {
Expand Down
84 changes: 84 additions & 0 deletions test/yamlSpecTranslator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,90 @@ testCases: []
expect(result.id).to.equal('My_Spec_case_2');
});

// A test case carrying contextVariables must yield a create_session step whose
// `context_variables` maps each name to its value.
it('injects context_variables when contextVariables present', () => {
  const testCase: TestCase = {
    utterance: 'Help with my camera',
    expectedTopic: 'Product_Help',
    expectedActions: undefined,
    expectedOutcome: undefined,
    contextVariables: [
      { name: 'RoutableId', value: '0Mw123' },
      { name: 'CaseId', value: '500456' },
    ],
  };

  const { steps } = translateTestCase(testCase, 0);
  const createSession = steps.find((step) => step.type === 'agent.create_session');

  expect(createSession).to.have.property('context_variables');
  expect((createSession as Record<string, unknown>).context_variables).to.deep.equal({
    RoutableId: '0Mw123',
    CaseId: '500456',
  });
});

// Without a contextVariables entry on the test case, the create_session step must
// not carry a `context_variables` property at all.
it('does not add context_variables when contextVariables absent', () => {
  const testCase: TestCase = {
    utterance: 'Hello',
    expectedTopic: undefined,
    expectedActions: undefined,
    expectedOutcome: undefined,
  };

  const { steps } = translateTestCase(testCase, 0);
  const createSession = steps.find((step) => step.type === 'agent.create_session');

  expect(createSession).to.not.have.property('context_variables');
});

// An empty contextVariables array is treated like a missing one: the create_session
// step must not carry a `context_variables` property.
it('does not add context_variables when contextVariables is empty', () => {
  const testCase: TestCase = {
    utterance: 'Hello',
    expectedTopic: undefined,
    expectedActions: undefined,
    expectedOutcome: undefined,
    contextVariables: [],
  };

  const { steps } = translateTestCase(testCase, 0);
  const createSession = steps.find((step) => step.type === 'agent.create_session');

  expect(createSession).to.not.have.property('context_variables');
});

// Two contextVariables sharing a name must make translateTestCase throw, and the
// message must name the test case index and the duplicated variable.
it('throws error when contextVariables has duplicate names', () => {
  const testCase: TestCase = {
    utterance: 'Hello',
    expectedTopic: undefined,
    expectedActions: undefined,
    expectedOutcome: undefined,
    contextVariables: [
      { name: 'CaseId', value: '500123' },
      { name: 'RoutableId', value: '0Mw456' },
      // Same name as the first entry, which makes it a duplicate.
      { name: 'CaseId', value: '500789' },
    ],
  };

  // Wrapped in a thunk so chai can invoke it and capture the thrown error.
  const translate = (): unknown => translateTestCase(testCase, 0);

  expect(translate).to.throw(/Duplicate contextVariable names found in test case 0: CaseId/);
});

// The array of { name, value } pairs on the test case must be emitted as a plain
// name -> value object on the create_session step.
it('translates contextVariables to context_variables object format', () => {
  const testCase: TestCase = {
    utterance: 'Test with context',
    expectedTopic: 'Test_Topic',
    expectedActions: undefined,
    expectedOutcome: undefined,
    contextVariables: [
      { name: 'CaseId', value: '500ABC' },
      { name: 'RoutableId', value: '0MwXYZ' },
      { name: 'UserId', value: '005DEF' },
    ],
  };

  const { steps } = translateTestCase(testCase, 0);
  const createSession = steps.find((step) => step.type === 'agent.create_session') as Record<string, unknown>;

  expect(createSession).to.have.property('context_variables');

  const actualVariables = createSession.context_variables as Record<string, string>;
  expect(actualVariables).to.deep.equal({
    CaseId: '500ABC',
    RoutableId: '0MwXYZ',
    UserId: '005DEF',
  });
});

it('sets use_agent_api true on create_session', () => {
const tc: TestCase = {
utterance: 'Hello',
Expand Down