diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index 356051da3f..52368df476 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -70,6 +70,7 @@ Evaluator, ExpectedToolCall, LocalEvaluator, + RubricScore, evaluate_agent, evaluate_workflow, evaluator, @@ -425,6 +426,7 @@ "ResponseStream", "Role", "RoleLiteral", + "RubricScore", "RunContext", "Runner", "RunnerContext", diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py index 585898ae52..65506cadc6 100644 --- a/python/packages/core/agent_framework/_agents.py +++ b/python/packages/core/agent_framework/_agents.py @@ -444,6 +444,49 @@ def get_session(self, service_session_id: str, *, session_id: str | None = None) """ return AgentSession(session_id=session_id, service_session_id=service_session_id) + def as_eval_source( + self, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + include_examples: bool = False, + examples: Sequence[str] | None = None, + ) -> str: + """Render this agent as a textual dossier for rubric-evaluator generation. + + Packages the agent's name, description, instructions, tool + definitions, and optional context-provider class names into a + single plain-text dossier suitable for passing to a rubric + generation pipeline (e.g. ``FoundryEvals.generate_rubric``). + + Defaults are conservative: instructions and tools are included; + examples and context-provider class names are not. + + Keyword Args: + include_instructions: Whether to include the agent's + instructions text. + include_tools: Whether to include tool definitions. + include_context_providers: Whether to include attached + context-provider class names. + include_examples: Whether to include the supplied ``examples``. 
+ examples: Sample queries / interactions to include when + ``include_examples`` is true. + + Returns: + A plain-text dossier describing the agent. + """ + from ._evaluation import _render_agent_dossier # pyright: ignore[reportPrivateUsage] + + return _render_agent_dossier( + self, + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + ) + async def _run_after_providers( self, *, diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 64fab0eacb..b14bdee9b2 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -311,12 +311,15 @@ class EvalScoreResult: score: Numeric score from the evaluator. passed: Whether the item passed this evaluator's threshold. sample: Optional raw evaluator output (rationale, metadata). + dimensions: Per-dimension scores when this evaluator is a rubric + evaluator. ``None`` for non-rubric (e.g. built-in) evaluators. """ name: str score: float passed: bool | None = None sample: dict[str, Any] | None = None + dimensions: list[RubricScore] | None = None @experimental(feature_id=ExperimentalFeature.EVALS) @@ -496,6 +499,313 @@ def raise_for_status(self, msg: str | None = None) -> None: detail += f" Errored items: {', '.join(summaries)}." raise EvalNotPassedError(detail) + def assert_score_at_least( + self, + min_score: float, + *, + evaluator: str | None = None, + msg: str | None = None, + ) -> None: + """Assert every item's score (optionally filtered by evaluator) is ``>= min_score``. + + Designed for CI gates on generated rubric evaluators (e.g. + ``results.assert_score_at_least(0.80)``). Includes any + sub-results from workflow evaluations. + + Args: + min_score: Minimum acceptable score (inclusive). 
+ evaluator: When set, only check scores from the evaluator + whose ``EvalScoreResult.name`` matches. + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any matching score is below the threshold. + """ + offenders: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + for score in item.scores: + if evaluator is not None and score.name != evaluator: + continue + if score.score < min_score: + offenders.append(f"{item.item_id}/{score.name}={score.score:.3f}") + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + if offenders: + detail = msg or ( + f"{len(offenders)} score(s) below threshold {min_score}" + f"{' for ' + evaluator if evaluator else ''}: {', '.join(offenders[:5])}" + + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "") + ) + raise EvalNotPassedError(detail) + + def assert_dimension_score_at_least( + self, + dimension_id: str, + min_score: float, + *, + evaluator: str | None = None, + require_applicable: bool = False, + msg: str | None = None, + ) -> None: + """Assert every item's score for a rubric *dimension* is ``>= min_score``. + + Walks ``EvalScoreResult.dimensions`` looking for the named + dimension across all items (and sub-results). Non-applicable + dimensions are skipped by default; pass + ``require_applicable=True`` to fail when no applicable score is + produced. + + Args: + dimension_id: Dimension id (matches the rubric definition). + min_score: Minimum acceptable dimension score (inclusive). + evaluator: When set, only consider scores from the evaluator + whose ``EvalScoreResult.name`` matches. + require_applicable: When ``True``, missing or non-applicable + dimension scores raise. Defaults to ``False`` (skip). + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When the dimension fails the threshold. 
+ """ + offenders: list[str] = [] + missing_items: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + found_applicable = False + for score in item.scores: + if evaluator is not None and score.name != evaluator: + continue + if not score.dimensions: + continue + for rs in score.dimensions: + if rs.id != dimension_id: + continue + if not rs.applicable: + continue + found_applicable = True + if rs.score is None or rs.score < min_score: + offenders.append( + f"{item.item_id}/{score.name}/{dimension_id}=" + f"{rs.score if rs.score is not None else 'None'}" + ) + if require_applicable and not found_applicable: + missing_items.append(item.item_id) + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + problems: list[str] = [] + if offenders: + problems.append( + f"{len(offenders)} dimension score(s) for '{dimension_id}' below {min_score}: " + f"{', '.join(offenders[:5])}" + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "") + ) + if missing_items: + problems.append( + f"Dimension '{dimension_id}' not applicable on {len(missing_items)} item(s): " + f"{', '.join(missing_items[:5])}" + ) + if problems: + raise EvalNotPassedError(msg or "; ".join(problems)) + + def assert_no_failed_items(self, msg: str | None = None) -> None: + """Assert no item ended in ``fail`` or ``error`` status. + + Includes any sub-results from workflow evaluations. + + Args: + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any item failed or errored. 
+ """ + bad: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + if item.is_failed or item.is_error: + bad.append(f"{item.item_id}:{item.status}") + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + if bad: + detail = msg or ( + f"{len(bad)} item(s) failed or errored: {', '.join(bad[:5])}" + + (f" (+{len(bad) - 5} more)" if len(bad) > 5 else "") + ) + raise EvalNotPassedError(detail) + + +# endregion + +# region Generated rubric evaluators + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricScore: + """A single dimension's score from a rubric-based evaluator run. + + Rubric evaluators emit one ``RubricScore`` per dimension per item. + Attached to :class:`EvalScoreResult` as a typed view of the raw + ``properties.rubric_scores`` payload returned by providers such as + Foundry's generated rubric evaluators. + + Attributes: + id: Dimension id (matches the rubric definition). + score: Numeric score, or ``None`` when the dimension was marked + non-applicable for this item. + applicable: Whether the dimension applied to this item. + weight: Dimension weight (mirrors the rubric definition). + reason: Short rationale produced by the evaluator. 
+ """ + + id: str + score: int | None + applicable: bool + weight: int + reason: str + + +# endregion + +# region Eval source rendering + + +def _render_agent_dossier( + agent: Any, + *, + include_instructions: bool, + include_tools: bool, + include_context_providers: bool, + include_examples: bool, + examples: Sequence[str] | None, +) -> str: + """Render a structured, plain-text dossier of an agent for rubric generation.""" + lines: list[str] = [] + name = getattr(agent, "name", None) or "" + description = getattr(agent, "description", None) + lines.append(f"Agent name: {name}") + if description: + lines.append(f"Description: {description}") + + if include_instructions: + instructions: str | None = None + default_options: Any = getattr(agent, "default_options", None) + if isinstance(default_options, dict): + raw_instr: Any = cast("dict[str, Any]", default_options).get("instructions") + if isinstance(raw_instr, str) and raw_instr.strip(): + instructions = raw_instr + if instructions is None: + raw_instr = getattr(agent, "instructions", None) + if isinstance(raw_instr, str) and raw_instr.strip(): + instructions = raw_instr + if instructions: + lines.append("") + lines.append("Instructions:") + lines.append(instructions.strip()) + + if include_tools: + tool_defs = AgentEvalConverter.extract_tools(agent) + if tool_defs: + lines.append("") + lines.append("Tools:") + for tool in tool_defs: + tool_line = f"- {tool['name']}" + tool_desc = tool.get("description") + if tool_desc: + tool_line += f": {tool_desc}" + lines.append(tool_line) + params = tool.get("parameters") + if params: + try: + params_json = json.dumps(params, sort_keys=True) + except (TypeError, ValueError): + params_json = str(params) + lines.append(f" parameters: {params_json}") + + if include_context_providers: + providers = getattr(agent, "context_providers", None) + if providers: + lines.append("") + lines.append("Context providers:") + for provider in providers: + lines.append(f"- 
{type(provider).__name__}") + + if include_examples and examples: + lines.append("") + lines.append("Examples:") + for idx, example in enumerate(examples, start=1): + lines.append(f"{idx}. {example}") + + return "\n".join(lines).strip() + + +def _render_workflow_dossier( # pyright: ignore[reportUnusedFunction] + workflow: Workflow, + *, + include_instructions: bool, + include_tools: bool, + include_context_providers: bool, + include_examples: bool, + examples: Sequence[str] | None, + include_topology: bool, +) -> str: + """Render a structured, plain-text dossier of a workflow for rubric generation.""" + from ._workflows._agent_executor import AgentExecutor as _AE + + lines: list[str] = [] + name = workflow.name or "" + lines.append(f"Workflow name: {name}") + if workflow.description: + lines.append(f"Description: {workflow.description}") + + if include_topology: + try: + topology = json.dumps(workflow.to_dict(), sort_keys=True, default=str) + except (TypeError, ValueError) as exc: + logger.debug("Workflow.to_dict() failed during eval source export: %s", exc) + topology = None + if topology: + lines.append("") + lines.append("Topology (JSON):") + lines.append(topology) + + agent_executors: list[tuple[str, Any]] = [] + for executor_id, executor in workflow.executors.items(): + if isinstance(executor, _AE): + agent_executors.append((executor_id, executor.agent)) + + if agent_executors: + lines.append("") + lines.append("Agents:") + for executor_id, agent in agent_executors: + lines.append("") + lines.append(f"Executor: {executor_id}") + dossier = _render_agent_dossier( + agent, + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=False, + examples=None, + ) + lines.append(dossier) + + if include_examples and examples: + lines.append("") + lines.append("Examples:") + for idx, example in enumerate(examples, start=1): + lines.append(f"{idx}. 
{example}") + + return "\n".join(lines).strip() + # endregion diff --git a/python/packages/core/agent_framework/_workflows/_workflow.py b/python/packages/core/agent_framework/_workflows/_workflow.py index 0493cd015f..bce7569ef1 100644 --- a/python/packages/core/agent_framework/_workflows/_workflow.py +++ b/python/packages/core/agent_framework/_workflows/_workflow.py @@ -410,6 +410,55 @@ def to_json(self) -> str: """Serialize the workflow definition to JSON.""" return json.dumps(self.to_dict()) + def as_eval_source( + self, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + include_examples: bool = False, + examples: Sequence[str] | None = None, + include_topology: bool = True, + ) -> str: + """Render this workflow as a textual dossier for rubric-evaluator generation. + + Produces a plain-text dossier containing the workflow's name, + description, optional JSON-encoded topology (from + :meth:`Workflow.to_dict`), and per-agent dossiers extracted from + ``AgentExecutor`` nodes. Suitable for passing to a rubric + generation pipeline (e.g. ``FoundryEvals.generate_rubric``). + + Defaults are conservative: per-agent instructions and tools are + included, plus the JSON-encoded topology. Examples and + context-provider class names are excluded by default. + + Keyword Args: + include_instructions: Per-agent instructions inclusion. + include_tools: Per-agent tool-definition inclusion. + include_context_providers: Per-agent context-provider + inclusion. + include_examples: Whether to include workflow-level + ``examples``. + examples: Sample queries / interactions to include when + ``include_examples`` is true. + include_topology: Whether to embed the JSON-encoded workflow + topology in the rendered dossier. + + Returns: + A plain-text dossier describing the workflow. 
+ """ + from .._evaluation import _render_workflow_dossier # pyright: ignore[reportPrivateUsage] + + return _render_workflow_dossier( + self, + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + include_topology=include_topology, + ) + def get_start_executor(self) -> Executor: """Get the starting executor of the workflow. diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py index 96b0e1a391..e4c37dfb4b 100644 --- a/python/packages/core/tests/core/test_local_eval.py +++ b/python/packages/core/tests/core/test_local_eval.py @@ -5,14 +5,20 @@ from __future__ import annotations import inspect +from typing import Any import pytest from agent_framework._evaluation import ( CheckResult, EvalItem, + EvalItemResult, + EvalNotPassedError, + EvalResults, + EvalScoreResult, ExpectedToolCall, LocalEvaluator, + RubricScore, _coerce_result, evaluator, keyword_check, @@ -1010,19 +1016,300 @@ def test_all_passed_parent_fails_when_own_counts_fail(self): # --------------------------------------------------------------------------- -# r5 review: _build_overall_item with empty outputs +# Rubric assertions (EvalResults.assert_*) # --------------------------------------------------------------------------- -class TestBuildOverallItemEmpty: - """Test _build_overall_item returns None for empty workflow outputs.""" +def _rubric_results(*scores_per_item: list[EvalScoreResult]) -> EvalResults: + items = [ + EvalItemResult(item_id=f"item-{i}", status="pass", scores=scores) for i, scores in enumerate(scores_per_item) + ] + return EvalResults( + provider="test", + eval_id="ev1", + run_id="run1", + result_counts={"passed": len(items), "failed": 0, "errored": 0, "total": len(items)}, + items=items, + ) + + +class TestRubricAssertions: + """Tests for EvalResults.assert_dimension_score_at_least.""" + + 
def test_dimension_at_or_above_threshold_passes(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ) + ], + ) + # Should not raise. + results.assert_dimension_score_at_least("clarity", 3) + + def test_dimension_below_threshold_raises(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=0.5, + dimensions=[RubricScore(id="clarity", score=2, applicable=True, weight=1, reason="")], + ) + ], + ) + with pytest.raises(EvalNotPassedError): + results.assert_dimension_score_at_least("clarity", 3) + + def test_non_applicable_skipped_by_default(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=1.0, + dimensions=[RubricScore(id="clarity", score=None, applicable=False, weight=1, reason="n/a")], + ) + ], + ) + # No applicable scores; default behaviour is to skip silently. + results.assert_dimension_score_at_least("clarity", 3) + + def test_require_applicable_raises_when_dimension_absent(self) -> None: + results = _rubric_results( + [EvalScoreResult(name="policy", score=1.0, dimensions=[])], + ) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("clarity", 3, require_applicable=True) + + def test_require_applicable_raises_when_filtered_evaluator_missing(self) -> None: + # Regression: previously the (not evaluator or found_any) guard caused + # this case to silently pass even with require_applicable=True. 
+ results = _rubric_results( + [ + EvalScoreResult( + name="other", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ) + ], + ) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("clarity", 3, evaluator="policy", require_applicable=True) + + def test_evaluator_filter_isolates_offenders(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="other", + score=0.1, + dimensions=[RubricScore(id="clarity", score=1, applicable=True, weight=1, reason="")], + ), + EvalScoreResult( + name="policy", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ), + ], + ) + # The low-scoring "other" evaluator is filtered out; "policy" passes. + results.assert_dimension_score_at_least("clarity", 3, evaluator="policy") + + +# --------------------------------------------------------------------------- +# Eval source rendering (string dossiers) +# --------------------------------------------------------------------------- + - def test_returns_none_for_empty_outputs(self): +class TestAgentAsEvalSource: + """Tests for BaseAgent.as_eval_source / _render_agent_dossier.""" + + def _make_mock_agent( + self, + *, + name: str = "weather-bot", + description: str | None = "Looks up the weather.", + instructions: str | None = "Be concise. 
Always cite the source.", + tools: list[Any] | None = None, + context_providers: list[Any] | None = None, + mcp_tools: list[Any] | None = None, + ) -> Any: from unittest.mock import MagicMock - from agent_framework._evaluation import _build_overall_item + from agent_framework._tools import ai_function + + agent = MagicMock() + agent.name = name + agent.description = description + agent.default_options = {"instructions": instructions, "tools": tools or []} + agent.context_providers = context_providers or [] + agent.mcp_tools = mcp_tools or [] + if tools: + normalized: list[Any] = [] + for t in tools: + if callable(t) and not hasattr(t, "parameters"): + normalized.append(ai_function(t)) + else: + normalized.append(t) + agent.default_options["tools"] = normalized + return agent + + def _render(self, agent: Any, **overrides: Any) -> str: + from agent_framework._evaluation import _render_agent_dossier + + kwargs: dict[str, Any] = { + "include_instructions": True, + "include_tools": True, + "include_context_providers": False, + "include_examples": False, + "examples": None, + } + kwargs.update(overrides) + return _render_agent_dossier(agent, **kwargs) + + def test_basic_dossier_includes_name_and_instructions(self): + agent = self._make_mock_agent() + dossier = self._render(agent) + assert isinstance(dossier, str) + assert "Agent name: weather-bot" in dossier + assert "Description: Looks up the weather." in dossier + assert "Instructions:" in dossier + assert "Be concise." 
in dossier + + def test_tools_section_includes_definitions(self): + def get_weather(city: str) -> str: + """Return the current weather for *city*.""" + return f"sunny in {city}" + + agent = self._make_mock_agent(tools=[get_weather]) + dossier = self._render(agent) + assert "Tools:" in dossier + assert "- get_weather" in dossier + assert '"city"' in dossier + + def test_include_instructions_false_omits_section(self): + agent = self._make_mock_agent() + dossier = self._render(agent, include_instructions=False) + assert "Instructions:" not in dossier + + def test_include_tools_false_omits_section(self): + def get_weather(city: str) -> str: + return f"sunny in {city}" + + agent = self._make_mock_agent(tools=[get_weather]) + dossier = self._render(agent, include_tools=False) + assert "Tools:" not in dossier + + def test_context_providers_excluded_by_default_but_included_when_opted_in(self): + class StubProvider: + pass + + agent = self._make_mock_agent(context_providers=[StubProvider()]) + default_dossier = self._render(agent) + assert "Context providers:" not in default_dossier + + opt_in_dossier = self._render(agent, include_context_providers=True) + assert "Context providers:" in opt_in_dossier + assert "- StubProvider" in opt_in_dossier + + def test_examples_excluded_by_default_but_included_when_opted_in(self): + agent = self._make_mock_agent() + default_dossier = self._render(agent, examples=["What's the weather in NYC?"]) + assert "Examples:" not in default_dossier + + opt_in_dossier = self._render( + agent, + include_examples=True, + examples=["What's the weather in NYC?"], + ) + assert "Examples:" in opt_in_dossier + assert "What's the weather in NYC?" 
in opt_in_dossier + + def test_base_agent_method_returns_dossier_string(self): + from agent_framework._agents import BaseAgent + + class _ConcreteAgent(BaseAgent): + pass + + agent = _ConcreteAgent(name="test-agent", description="A test agent.") + dossier = agent.as_eval_source() + assert isinstance(dossier, str) + assert "Agent name: test-agent" in dossier + + +class TestWorkflowAsEvalSource: + """Tests for Workflow.as_eval_source / _render_workflow_dossier.""" + + def _build_workflow(self, *, with_agent: bool = False) -> Any: + from unittest.mock import MagicMock - mock_result = MagicMock() - mock_result.get_outputs.return_value = [] - item = _build_overall_item("Hello", mock_result) - assert item is None + from agent_framework._workflows._agent_executor import AgentExecutor + + workflow = MagicMock() + workflow.name = "demo-workflow" + workflow.description = "Routes user questions through a single agent." + workflow.to_dict.return_value = { + "name": "demo-workflow", + "id": "wf_1", + "start_executor_id": "agent_1", + "edge_groups": [], + "executors": {"agent_1": {"type": "AgentExecutor"}}, + } + + if with_agent: + inner_agent = MagicMock() + inner_agent.name = "inner-agent" + inner_agent.description = "Inner agent." 
+ inner_agent.default_options = {"instructions": "Answer politely.", "tools": []} + inner_agent.context_providers = [] + inner_agent.mcp_tools = [] + + executor = MagicMock(spec=AgentExecutor) + executor.agent = inner_agent + workflow.executors = {"agent_1": executor} + else: + workflow.executors = {} + return workflow + + def _render(self, workflow: Any, **overrides: Any) -> str: + from agent_framework._evaluation import _render_workflow_dossier + + kwargs: dict[str, Any] = { + "include_instructions": True, + "include_tools": True, + "include_context_providers": False, + "include_examples": False, + "examples": None, + "include_topology": True, + } + kwargs.update(overrides) + return _render_workflow_dossier(workflow, **kwargs) + + def test_emits_dossier_with_topology(self): + workflow = self._build_workflow() + dossier = self._render(workflow) + assert isinstance(dossier, str) + assert "Workflow name: demo-workflow" in dossier + assert "Topology (JSON):" in dossier + assert '"start_executor_id": "agent_1"' in dossier + + def test_topology_can_be_disabled(self): + workflow = self._build_workflow() + dossier = self._render(workflow, include_topology=False) + assert "Topology (JSON):" not in dossier + + def test_per_agent_dossiers_included_when_executor_is_agent_executor(self): + workflow = self._build_workflow(with_agent=True) + dossier = self._render(workflow) + assert "Agents:" in dossier + assert "Executor: agent_1" in dossier + assert "Agent name: inner-agent" in dossier + assert "Answer politely." 
in dossier + + def test_workflow_examples_excluded_by_default(self): + workflow = self._build_workflow() + default_dossier = self._render(workflow, examples=["Hi"]) + assert "Examples:" not in default_dossier + + opt_in_dossier = self._render(workflow, examples=["Hi"], include_examples=True) + assert "Examples:" in opt_in_dossier diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index 002e63f8a6..efbe0b8d24 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -10,10 +10,22 @@ FoundryEmbeddingSettings, RawFoundryEmbeddingClient, ) +from ._evals_config import ( + RubricGenerationSpec, + RubricSourceSpec, + build_sources, + load_evals_config, + parse_evals_config, +) from ._foundry_evals import ( + EvalGenerationSource, FoundryEvals, + GeneratedEvaluatorRef, + RubricDimension, + agent_as_eval_source, evaluate_foundry_target, evaluate_traces, + workflow_as_eval_source, ) from ._memory_provider import FoundryMemoryProvider @@ -23,6 +35,7 @@ __version__ = "0.0.0" __all__ = [ + "EvalGenerationSource", "FoundryAgent", "FoundryAgentOptions", "FoundryChatClient", @@ -32,11 +45,20 @@ "FoundryEmbeddingSettings", "FoundryEvals", "FoundryMemoryProvider", + "GeneratedEvaluatorRef", "RawFoundryAgent", "RawFoundryAgentChatClient", "RawFoundryChatClient", "RawFoundryEmbeddingClient", + "RubricDimension", + "RubricGenerationSpec", + "RubricSourceSpec", "__version__", + "agent_as_eval_source", + "build_sources", "evaluate_foundry_target", "evaluate_traces", + "load_evals_config", + "parse_evals_config", + "workflow_as_eval_source", ] diff --git a/python/packages/foundry/agent_framework_foundry/_evals_config.py b/python/packages/foundry/agent_framework_foundry/_evals_config.py new file mode 100644 index 0000000000..5f45e2854b --- /dev/null +++ b/python/packages/foundry/agent_framework_foundry/_evals_config.py @@ 
-0,0 +1,403 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""YAML-driven evaluator configuration for rubric generation and evaluation. + +Defines the source-controlled config schema described in +``adaptive-evals-draft.md``: a list of named rubric-generation specs that +CI jobs and harnesses parse to drive +:meth:`FoundryEvals.generate_rubric`. + +Example config: + +.. code-block:: yaml + + evaluators: + reservation-agent-quality: + type: foundry.generated_rubric + category: quality + model: gpt-4o + agent: reservation-agent + sources: + - type: agent + include_instructions: true + include_tools: true + - type: dataset + name: reservation-business-rules + version: "1" + +Example loader usage: + +.. code-block:: python + + from agent_framework_foundry import FoundryEvals, build_sources, load_evals_config + + config = load_evals_config("evaluators.yaml") + spec = config["reservation-agent-quality"] + sources = build_sources(spec, agent=agent) + ref = await FoundryEvals.generate_rubric( + project_client=client, + name=spec.name, + sources=sources, + category=spec.category, + model=spec.model, + display_name=spec.display_name, + description=spec.description, + ) +""" + +from __future__ import annotations + +import os +from collections.abc import Mapping +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Literal, cast + +from agent_framework._feature_stage import ExperimentalFeature, experimental + +from ._foundry_evals import ( + EvalGenerationSource, + agent_as_eval_source, + workflow_as_eval_source, +) + +_RUBRIC_TYPE = "foundry.generated_rubric" + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricSourceSpec: + """A single source entry in a :class:`RubricGenerationSpec` ``sources`` list. + + Mirrors the per-source YAML schema. The :attr:`type` field is the + discriminator; only the fields relevant to each type are read. 
+ + Attributes: + type: One of ``"agent"``, ``"workflow"``, ``"prompt"``, + ``"dataset"``, ``"traces"``. + description: Optional description shown in Foundry UI. + include_instructions: Whether to include the bound agent / + workflow's instructions. Applies to ``"agent"`` and + ``"workflow"`` types. + include_tools: Whether to include the bound agent / workflow's + tools. Applies to ``"agent"`` and ``"workflow"`` types. + include_context_providers: Whether to include attached + context-provider class names. Applies to ``"agent"`` and + ``"workflow"`` types. + include_examples: Whether to include ``examples``. Applies to + ``"agent"`` and ``"workflow"`` types. + include_topology: Whether to include the JSON-encoded topology. + Applies to ``"workflow"`` type. + examples: Optional list of example queries for ``"agent"`` / + ``"workflow"`` sources. + prompt: Rendered dossier for ``"prompt"`` type. + agent_name: Hosted Foundry agent name for ``"agent"`` type with + a server-side reference. + name: Dataset name for ``"dataset"`` type. + version: Pinned dataset version. + metadata: Free-form metadata for ``"traces"`` sources. + """ + + type: Literal["agent", "workflow", "prompt", "dataset", "traces"] + description: str | None = None + include_instructions: bool = True + include_tools: bool = True + include_context_providers: bool = False + include_examples: bool = False + include_topology: bool = True + examples: tuple[str, ...] = field(default_factory=tuple) + prompt: str | None = None + agent_name: str | None = None + name: str | None = None + version: str | None = None + metadata: dict[str, Any] | None = None + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricGenerationSpec: + """A single named entry from an evaluators YAML config. + + Attributes: + name: Evaluator name (the YAML key under ``evaluators``). + type: Discriminator literal. Must be + ``"foundry.generated_rubric"`` for rubric evaluators. 
+ category: ``"quality"`` or ``"safety"``. + model: Optional model deployment to drive generation. + agent: Optional symbolic reference to the agent in the + caller's harness. Resolved by user code into a + :class:`BaseAgent` and passed to + :func:`build_sources`. + workflow: Optional symbolic reference to a workflow. + display_name: Optional human-readable name. + description: Optional description. + sources: List of source specs to feed into generation. When + empty, callers typically default to a single + ``RubricSourceSpec(type='agent')`` or + ``RubricSourceSpec(type='workflow')`` source. + """ + + name: str + type: str = _RUBRIC_TYPE + category: Literal["quality", "safety"] = "quality" + model: str | None = None + agent: str | None = None + workflow: str | None = None + display_name: str | None = None + description: str | None = None + sources: tuple[RubricSourceSpec, ...] = field(default_factory=tuple) + + +@experimental(feature_id=ExperimentalFeature.EVALS) +def load_evals_config(path: str | os.PathLike[str]) -> dict[str, RubricGenerationSpec]: + """Load a YAML evaluators config and return a name -> spec mapping. + + Reads ``path`` (UTF-8) and parses the top-level ``evaluators`` + mapping into :class:`RubricGenerationSpec` instances keyed by name. + + Requires ``PyYAML``. Raises :class:`ImportError` with a helpful + message when PyYAML is not installed. + + Args: + path: Filesystem path to the YAML config. + + Returns: + A dict mapping evaluator name to :class:`RubricGenerationSpec`. + + Raises: + ImportError: If PyYAML is not installed. + ValueError: If the YAML file is malformed. + """ + try: + import yaml # type: ignore[import-untyped] + except ImportError as exc: + raise ImportError("load_evals_config requires PyYAML. 
@experimental(feature_id=ExperimentalFeature.EVALS)
def parse_evals_config(data: Any) -> dict[str, RubricGenerationSpec]:
    """Parse an already-loaded YAML mapping into rubric-generation specs.

    Useful when callers manage YAML loading themselves (e.g. CI that
    interpolates env vars before parsing).

    Args:
        data: A mapping with an ``"evaluators"`` key containing a mapping
            of evaluator names to spec dicts.

    Returns:
        A dict mapping evaluator name to :class:`RubricGenerationSpec`.
        An empty ``evaluators`` mapping yields an empty dict.

    Raises:
        ValueError: If ``data`` is not a mapping, the ``evaluators`` key
            is missing or not a mapping, an evaluator name is not a
            string, or an evaluator entry is not a mapping.
    """
    if not isinstance(data, Mapping):
        raise ValueError("Evaluators config must be a mapping.")

    raw_evaluators = cast("Mapping[str, Any]", data).get("evaluators")
    if raw_evaluators is None:
        raise ValueError("Evaluators config is missing a top-level 'evaluators' key.")
    if not isinstance(raw_evaluators, Mapping):
        raise ValueError("Evaluators config 'evaluators' entry must be a mapping.")

    parsed: dict[str, RubricGenerationSpec] = {}
    for name, raw in cast("Mapping[Any, Any]", raw_evaluators).items():
        # YAML loaders resolve unquoted keys such as ``on``, ``yes`` or ``123``
        # to bool/int; reject them here so ``spec.name`` is always a real string.
        if not isinstance(name, str):
            raise ValueError(f"Evaluator name {name!r} must be a string, got {type(name).__name__}.")
        if not isinstance(raw, Mapping):
            raise ValueError(f"Evaluator entry {name!r} must be a mapping, got {type(raw).__name__}.")
        parsed[name] = _parse_spec(name, cast("Mapping[str, Any]", raw))
    return parsed
def _parse_bool_flag(spec_name: str, index: int, raw: Mapping[str, Any], key: str, default: bool) -> bool:
    """Read an optional boolean flag from a source mapping, rejecting non-boolean values."""
    if key not in raw:
        return default
    value = raw[key]
    # ``bool("false")`` is True, so coercing would silently flip a flag the
    # author meant to disable. Only genuine booleans are accepted.
    if not isinstance(value, bool):
        raise ValueError(
            f"Evaluator {spec_name!r} source {index} {key!r} must be a boolean, got {type(value).__name__}."
        )
    return value


def _parse_source(spec_name: str, index: int, raw: Mapping[str, Any]) -> RubricSourceSpec:
    """Convert one raw ``sources`` entry into a :class:`RubricSourceSpec`.

    Args:
        spec_name: Name of the evaluator that owns this source; used only
            in error messages.
        index: Position of the entry within the evaluator's ``sources``
            list; used only in error messages.
        raw: The raw mapping for this source entry.

    Returns:
        The parsed source spec. Include-flags that are absent take their
        defaults (``include_instructions``, ``include_tools`` and
        ``include_topology`` default to ``True``;
        ``include_context_providers`` and ``include_examples`` default to
        ``False``). ``examples`` entries are converted with ``str`` and a
        non-``None`` ``version`` is converted to ``str``.

    Raises:
        ValueError: If ``type`` is not one of the supported source types,
            an include-flag is present but not a boolean, ``examples`` is
            not a list/tuple, or ``metadata`` is not a mapping.
    """
    type_value = raw.get("type")
    if type_value not in ("agent", "workflow", "prompt", "dataset", "traces"):
        raise ValueError(
            f"Evaluator {spec_name!r} source {index} has invalid type {type_value!r}; "
            "expected one of 'agent', 'workflow', 'prompt', 'dataset', 'traces'."
        )

    include_instructions = _parse_bool_flag(spec_name, index, raw, "include_instructions", True)
    include_tools = _parse_bool_flag(spec_name, index, raw, "include_tools", True)
    include_context_providers = _parse_bool_flag(spec_name, index, raw, "include_context_providers", False)
    include_examples = _parse_bool_flag(spec_name, index, raw, "include_examples", False)
    include_topology = _parse_bool_flag(spec_name, index, raw, "include_topology", True)

    # A missing or empty ``examples`` value is treated as "no examples".
    examples_raw: Any = raw.get("examples") or ()
    if not isinstance(examples_raw, (list, tuple)):
        raise ValueError(f"Evaluator {spec_name!r} source {index} 'examples' must be a list.")
    examples = tuple(str(example) for example in cast("Any", examples_raw))

    metadata_raw = raw.get("metadata")
    if metadata_raw is not None and not isinstance(metadata_raw, Mapping):
        raise ValueError(f"Evaluator {spec_name!r} source {index} 'metadata' must be a mapping.")

    version_raw = raw.get("version")

    return RubricSourceSpec(
        type=cast("Any", type_value),
        description=raw.get("description"),
        include_instructions=include_instructions,
        include_tools=include_tools,
        include_context_providers=include_context_providers,
        include_examples=include_examples,
        include_topology=include_topology,
        examples=examples,
        prompt=raw.get("prompt"),
        agent_name=raw.get("agent_name"),
        name=raw.get("name"),
        # YAML may load a version such as ``2`` as an int; normalise to str.
        version=str(version_raw) if version_raw is not None else None,
        # Copy so the frozen spec does not alias the caller's mapping.
        metadata=dict(cast("Mapping[str, Any]", metadata_raw)) if metadata_raw is not None else None,
    )
+ * ``type='prompt'``, ``type='dataset'``, and ``type='traces'`` + sources are translated directly into + :class:`EvalGenerationSource` instances without consulting the + runtime agent or workflow. + + When the spec has no ``sources`` entries, defaults to a single + ``type='agent'`` source when an ``agent`` is provided, or a single + ``type='workflow'`` source when a ``workflow`` is provided. + + Args: + spec: Parsed :class:`RubricGenerationSpec`. + agent: Optional agent instance for ``type='agent'`` sources. + workflow: Optional workflow instance for ``type='workflow'`` + sources. + + Returns: + A list of :class:`EvalGenerationSource` instances ready to pass + to :meth:`FoundryEvals.generate_rubric` as ``sources=``. + + Raises: + ValueError: If a source references an agent or workflow that + was not supplied. + """ + if not spec.sources: + if agent is not None: + return [agent_as_eval_source(agent)] + if workflow is not None: + return [workflow_as_eval_source(workflow)] + raise ValueError(f"Spec {spec.name!r} has no sources and no agent/workflow was provided to build_sources().") + + out: list[EvalGenerationSource] = [] + for src in spec.sources: + if src.type == "agent": + if src.agent_name: + out.append( + EvalGenerationSource( + type="agent", + agent_name=src.agent_name, + description=src.description, + ) + ) + continue + if agent is None: + raise ValueError(f"Spec {spec.name!r} has a source of type 'agent' but no agent= was provided.") + out.append( + agent_as_eval_source( + agent, + include_instructions=src.include_instructions, + include_tools=src.include_tools, + include_context_providers=src.include_context_providers, + include_examples=src.include_examples, + examples=list(src.examples) if src.examples else None, + ) + ) + elif src.type == "workflow": + if workflow is None: + raise ValueError(f"Spec {spec.name!r} has a source of type 'workflow' but no workflow= was provided.") + out.append( + workflow_as_eval_source( + workflow, + 
include_instructions=src.include_instructions, + include_tools=src.include_tools, + include_context_providers=src.include_context_providers, + include_examples=src.include_examples, + examples=list(src.examples) if src.examples else None, + include_topology=src.include_topology, + ) + ) + elif src.type == "prompt": + if not src.prompt: + raise ValueError(f"Spec {spec.name!r} has a 'prompt' source missing the 'prompt' field.") + out.append(EvalGenerationSource(type="prompt", prompt=src.prompt, description=src.description)) + elif src.type == "dataset": + if not src.name: + raise ValueError(f"Spec {spec.name!r} has a 'dataset' source missing the 'name' field.") + out.append( + EvalGenerationSource( + type="dataset", + dataset_name=src.name, + dataset_version=src.version, + description=src.description, + ) + ) + elif src.type == "traces": + out.append( + EvalGenerationSource( + type="traces", + description=src.description, + metadata=src.metadata, + ) + ) + else: # pragma: no cover - guarded by _parse_source + raise ValueError(f"Spec {spec.name!r} has unknown source type {src.type!r}.") + return out + + +__all__ = [ + "RubricGenerationSpec", + "RubricSourceSpec", + "build_sources", + "load_evals_config", + "parse_evals_config", +] diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index eef58b0a04..2b8d7913e0 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -29,7 +29,8 @@ import asyncio import logging from collections.abc import Sequence -from typing import TYPE_CHECKING, Any +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Literal, cast from agent_framework._evaluation import ( AgentEvalConverter, @@ -39,6 +40,7 @@ EvalItemResult, EvalResults, EvalScoreResult, + RubricScore, ) from agent_framework._feature_stage import ExperimentalFeature, 
@experimental(feature_id=ExperimentalFeature.EVALS)
@dataclass(frozen=True)
class RubricDimension:
    """One scoring dimension of a generated rubric evaluator.

    A rubric evaluator scores each item along one or more named
    dimensions. This immutable record exposes a dimension's identifier,
    description and weight so callers can inspect the structure of a
    generated evaluator in code.

    Attributes:
        id: Stable identifier of the dimension (e.g.
            ``"policy_enforcement"``).
        description: Natural-language description of what is scored.
        weight: Integer weight of this dimension in the aggregate score.
        always_applicable: Defaults to ``False``. When ``False``, an
            evaluator may mark the dimension as non-applicable for an
            individual item.
    """

    # Required descriptive fields.
    id: str
    description: str
    weight: int

    # Optional applicability hint.
    always_applicable: bool = False
The dataclass accepts ``version=None`` for the + convenience of :meth:`latest`, but ``FoundryEvals`` emits a warning + whenever a versionless reference is used; CI gates should always + pass a concrete version. + + Attributes: + name: Evaluator name as stored in the Foundry project (e.g. + ``"my-policy-evaluator"``). Distinct from built-in + evaluators such as ``"builtin.relevance"``. + version: Pinned evaluator version. ``None`` means "latest" — + this is discouraged for CI/repro and ``FoundryEvals`` will + emit a warning when used. + category: ``"quality"`` for ungrounded rubric scoring, + ``"safety"`` for safety-focused evaluators. Matches the + Foundry evaluator's declared category. + display_name: Optional human-readable name used in result + summaries. Defaults to ``name`` when unset. + description: Optional description carried over from the + generated evaluator definition for documentation. + dimensions: Optional snapshot of the rubric's dimensions for + inspection. Not required to invoke the evaluator — the + service uses the persisted definition. + pass_threshold: Optional aggregate score threshold (0.0-1.0) the + evaluator considers a passing item. ``None`` defers to the + evaluator's stored default. + """ + + name: str + version: str | None = None + category: Literal["quality", "safety"] = "quality" + display_name: str | None = None + description: str | None = None + dimensions: tuple[RubricDimension, ...] | None = None + pass_threshold: float | None = None + + @classmethod + def latest( + cls, + name: str, + *, + category: Literal["quality", "safety"] = "quality", + display_name: str | None = None, + description: str | None = None, + ) -> GeneratedEvaluatorRef: + """Construct a versionless reference (resolves to the latest version at run time). + + Discouraged for reproducible runs. Prefer the constructor with + an explicit ``version`` so CI and replay evaluations stay stable + when the evaluator is regenerated. 
+ """ + return cls( + name=name, + version=None, + category=category, + display_name=display_name, + description=description, + ) + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class EvalGenerationSource: + """A source description passed to Foundry's evaluator generation pipeline. + + Rubric evaluator generation consumes one or more sources that describe + the agent or workflow under evaluation. ``FoundryEvals`` translates + instances into the underlying ``*EvaluatorGenerationJobSource`` SDK + types. + + Discriminated by :attr:`type`: + + * ``"prompt"`` - a free-form textual dossier (typical for local agents + and workflows whose tools cannot be fetched server-side). + * ``"agent"`` - a hosted Foundry agent referenced by name so the + service fetches tool definitions and metadata directly. + * ``"dataset"`` - a Foundry dataset of recorded interactions. + * ``"traces"`` - tracing data scoped by metadata. + + Only the fields relevant to :attr:`type` are populated; the remaining + fields stay ``None``. + + Attributes: + type: Source kind. See discriminator above. + description: Optional short description shown in Foundry UI. + prompt: Rendered dossier for ``type="prompt"`` sources. + agent_name: Hosted Foundry agent name for ``type="agent"`` sources. + agent_version: Optional pinned hosted-agent version for + ``type="agent"`` sources. ``None`` resolves to the latest + version at generation time; pin for reproducible runs. + dataset_name: Foundry dataset name for ``type="dataset"`` sources. + dataset_version: Pinned dataset version (recommended for repro). + metadata: Free-form metadata. Used by ``type="traces"`` sources + for tracing-attribute filters and as a generic escape hatch + for additional fields not yet modeled. 
+ """ + + type: Literal["prompt", "dataset", "agent", "traces"] + description: str | None = None + prompt: str | None = None + agent_name: str | None = None + agent_version: str | None = None + dataset_name: str | None = None + dataset_version: str | None = None + metadata: dict[str, Any] | None = None + + +@experimental(feature_id=ExperimentalFeature.EVALS) +def agent_as_eval_source( + agent: BaseAgent, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + include_examples: bool = False, + examples: Sequence[str] | None = None, + hosted_agent_name: str | None = None, + hosted_agent_version: str | None = None, + force_prompt_source: bool = False, +) -> EvalGenerationSource: + """Render an agent as an :class:`EvalGenerationSource` for rubric generation. + + Picks the best Foundry source variant for the supplied agent: + + * **Hosted Foundry agents** (``FoundryAgent`` connected to a Prompt + Agent or Hosted Agent in a Foundry project) are emitted as + ``type="agent"`` sources keyed by ``agent_name`` so the service + fetches instructions, tools, and metadata directly from the agent + registry — independent of whatever the local wrapper happens to + hold. Detected automatically from ``agent.chat_client.agent_name`` + and ``agent.chat_client.agent_version``. + * **Local agents** (any other ``BaseAgent`` whose instructions and + tools live client-side, e.g. ``FoundryChatClient``-backed agents or + pure OpenAI Responses agents) are emitted as ``type="prompt"`` + sources with a rendered text dossier. + + Override the heuristic by passing ``hosted_agent_name`` explicitly + (forces an ``"agent"`` source) or ``force_prompt_source=True`` + (forces a ``"prompt"`` source — useful when you want the service to + score a hosted agent against the *local* wrapper's overrides). + + Args: + agent: Agent instance (typically a ``BaseAgent`` subclass). 
+ include_instructions: Whether to include the agent's instructions + text in the dossier (``"prompt"`` sources only). Defaults to + ``True``. + include_tools: Whether to include tool definitions in the dossier + (``"prompt"`` sources only). Defaults to ``True``. + include_context_providers: Whether to include the names of + attached context-provider classes in the dossier + (``"prompt"`` sources only). Defaults to ``False`` to avoid + leaking implementation details. + include_examples: Whether to include the supplied ``examples`` in + the dossier (``"prompt"`` sources only). Defaults to + ``False`` to avoid shipping potentially sensitive sample + inputs by default. + examples: Optional sample queries / interactions to include when + ``include_examples`` is ``True``. + hosted_agent_name: When set, emit a ``type="agent"`` source + referencing this hosted Foundry agent name regardless of + auto-detection. Use to override or supplement the + heuristic. + hosted_agent_version: When set together with a hosted-agent + source, pins the source to a specific hosted-agent version. + Recommended for reproducible rubric generation against + PromptAgents. + force_prompt_source: When ``True``, always emit a + ``type="prompt"`` source with the rendered dossier even when + the agent is a hosted Foundry agent. Useful when the local + wrapper holds overrides the service-side agent doesn't see. + + Returns: + An :class:`EvalGenerationSource` describing the agent. 
def _detect_hosted_foundry_agent(agent: BaseAgent) -> tuple[str | None, str | None]:
    """Identify a hosted Foundry agent from its chat client.

    Reads ``agent.chat_client.agent_name`` and
    ``agent.chat_client.agent_version``. The agent counts as hosted only
    when the name is a non-empty string; requiring real strings keeps
    test doubles (e.g. ``MagicMock`` chat clients) from being detected
    by accident.

    Args:
        agent: The agent to inspect.

    Returns:
        ``(agent_name, agent_version)`` for a hosted agent, where the
        version is ``None`` unless it is a non-empty string. Returns
        ``(None, None)`` when there is no chat client or no usable name.
    """
    client = getattr(agent, "chat_client", None)
    if client is None:
        return (None, None)

    # Read both attributes up front, in the same order as before.
    hosted_name, hosted_version = (getattr(client, attr, None) for attr in ("agent_name", "agent_version"))

    if not (isinstance(hosted_name, str) and hosted_name):
        return (None, None)

    pinned_version = hosted_version if isinstance(hosted_version, str) and hosted_version else None
    return (hosted_name, pinned_version)
+ """ + prompt = workflow.as_eval_source( + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + include_topology=include_topology, + ) + return EvalGenerationSource( + type="prompt", + prompt=prompt, + description=workflow.description, + ) + + +# endregion # Agent evaluators that accept query/response as conversation arrays. # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk # for the latest evaluator list. These are the evaluators that need conversation-format input. @@ -166,7 +492,7 @@ def _resolve_evaluator(name: str) -> str: def _build_testing_criteria( - evaluators: Sequence[str], + evaluators: Sequence[str | GeneratedEvaluatorRef], model: str, *, include_data_mapping: bool = False, @@ -175,7 +501,9 @@ def _build_testing_criteria( """Build ``testing_criteria`` for ``evals.create()``. Args: - evaluators: Evaluator names. + evaluators: Evaluator names (built-in shorts / fully-qualified + ``builtin.*`` names) or :class:`GeneratedEvaluatorRef` + instances for generated rubric evaluators. model: Model deployment for the LLM judge. include_data_mapping: Whether to include field-level data mapping (required for the JSONL data source, not needed for response-based). @@ -183,7 +511,38 @@ def _build_testing_criteria( definitions. 
""" criteria: list[dict[str, Any]] = [] - for name in evaluators: + for entry_spec in evaluators: + if isinstance(entry_spec, GeneratedEvaluatorRef): + short = entry_spec.display_name or entry_spec.name + ref_entry: dict[str, Any] = { + "type": "azure_ai_evaluator", + "name": short, + "evaluator_name": entry_spec.name, + "initialization_parameters": {"deployment_name": model}, + } + if entry_spec.version is not None: + ref_entry["evaluator_version"] = entry_spec.version + else: + logger.warning( + "GeneratedEvaluatorRef '%s' has no pinned version; the eval run " + "will resolve to whichever version is current at execution time. " + "Pin the version for reproducible runs.", + entry_spec.name, + ) + if include_data_mapping: + # Rubric evaluators accept conversation arrays like agent + # evaluators, plus tool_definitions when items are tool-aware. + ref_mapping: dict[str, str] = { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + if include_tool_definitions: + ref_mapping["tool_definitions"] = "{{item.tool_definitions}}" + ref_entry["data_mapping"] = ref_mapping + criteria.append(ref_entry) + continue + + name = entry_spec qualified = _resolve_evaluator(name) short = name if not name.startswith("builtin.") else name.split(".")[-1] @@ -247,9 +606,9 @@ def _build_item_schema( def _resolve_default_evaluators( - evaluators: Sequence[str] | None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None, items: Sequence[EvalItem | dict[str, Any]] | None = None, -) -> list[str]: +) -> list[str | GeneratedEvaluatorRef]: """Resolve evaluators, applying defaults when ``None``. Defaults to relevance + coherence + task_adherence. 
Automatically adds @@ -258,7 +617,7 @@ def _resolve_default_evaluators( if evaluators is not None: return list(evaluators) - result = list(_DEFAULT_EVALUATORS) + result: list[str | GeneratedEvaluatorRef] = list(_DEFAULT_EVALUATORS) if items is not None: has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) if has_tools: @@ -267,14 +626,24 @@ def _resolve_default_evaluators( def _filter_tool_evaluators( - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], items: Sequence[EvalItem | dict[str, Any]], -) -> list[str]: - """Remove tool evaluators if no items have tool definitions.""" +) -> list[str | GeneratedEvaluatorRef]: + """Remove tool evaluators if no items have tool definitions. + + Generated rubric evaluators are tool-aware but not tool-required; they + are preserved regardless of whether items carry tool definitions. + """ has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) if has_tools: return evaluators - filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS] + + def _is_tool_only(spec: str | GeneratedEvaluatorRef) -> bool: + if isinstance(spec, GeneratedEvaluatorRef): + return False + return _resolve_evaluator(spec) in _TOOL_EVALUATORS + + filtered = [e for e in evaluators if not _is_tool_only(e)] if not filtered: raise ValueError( f"All requested evaluators {evaluators} require tool definitions, " @@ -282,7 +651,7 @@ def _filter_tool_evaluators( "or choose evaluators that do not require tools." 
def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
    """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.

    The per-dimension breakdown is looked up in these places, in order:

    * ``sample.properties.rubric_scores`` (attribute access), falling back
      to ``sample.properties["rubric_scores"]`` when ``properties`` is a
      dict.
    * ``sample["properties"]["rubric_scores"]`` when ``sample`` is a dict.
    * ``sample["rubric_scores"]`` when ``sample`` is a dict.

    Entries may be dicts or attribute-style objects. An entry is skipped
    when its ``id``, ``weight`` or ``applicable`` is missing, or when its
    values cannot be converted (logged at debug level).

    Args:
        sample: Raw ``sample`` value of a single evaluator result; may be
            ``None``, a dict, or an SDK object.

    Returns:
        The parsed scores, or ``None`` when no usable rubric scores are
        present (including when the payload is empty or not iterable).
    """
    if sample is None:
        return None

    raw: Any = None
    properties: Any = getattr(sample, "properties", None)
    if properties is not None:
        raw = getattr(properties, "rubric_scores", None)
        if raw is None and isinstance(properties, dict):
            raw = cast("dict[str, Any]", properties).get("rubric_scores")
    if raw is None and isinstance(sample, dict):
        sample_any = cast("dict[str, Any]", sample)
        props_dict: Any = sample_any.get("properties")
        if isinstance(props_dict, dict):
            raw = cast("dict[str, Any]", props_dict).get("rubric_scores")
        if raw is None:
            raw = sample_any.get("rubric_scores")

    if not raw:
        return None

    # A truthy but non-iterable payload (e.g. a bare number) must not abort
    # result parsing for the whole run; treat it as "no rubric scores".
    try:
        entries = iter(raw)
    except TypeError:
        return None

    parsed: list[RubricScore] = []
    for entry in entries:
        try:
            rid: Any
            score_val: Any
            applicable: Any
            weight: Any
            reason: Any
            if isinstance(entry, dict):
                entry_any = cast("dict[str, Any]", entry)
                rid = entry_any.get("id")
                score_val = entry_any.get("score")
                applicable = entry_any.get("applicable")
                weight = entry_any.get("weight")
                reason = entry_any.get("reason", "")
            else:
                rid = getattr(entry, "id", None)
                score_val = getattr(entry, "score", None)
                applicable = getattr(entry, "applicable", None)
                weight = getattr(entry, "weight", None)
                reason = getattr(entry, "reason", "") or ""
            if rid is None or weight is None or applicable is None:
                continue
            # Convert before constructing so every conversion failure is
            # handled by the except clause below.
            score = int(score_val) if isinstance(score_val, (int, float)) else None
            parsed.append(
                RubricScore(
                    id=str(rid),
                    score=score,
                    applicable=bool(applicable),
                    weight=int(weight),
                    reason=str(reason) if reason is not None else "",
                )
            )
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers int() of an infinite float; ValueError covers NaN.
            logger.debug("Skipping malformed rubric_scores entry: %s", cast("Any", entry), exc_info=True)
    return parsed or None
r.sample + dimensions = _extract_rubric_scores(sample) scores.append( EvalScoreResult( name=r.name, score=r.score, passed=r.passed, - sample=r.sample, + sample=sample, + dimensions=dimensions, ) ) @@ -472,7 +917,7 @@ async def _evaluate_via_responses_impl( *, client: AsyncOpenAI, response_ids: Sequence[str], - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], model: str, eval_name: str, poll_interval: float, @@ -573,8 +1018,11 @@ class FoundryEvals: (from ``azure.ai.projects.aio``). Provide this or *client*. model: Model deployment name for the evaluator LLM judge. Resolved from ``client.model`` when omitted. - evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``). - When ``None`` (default), uses smart defaults based on item data. + evaluators: Evaluator specifications. Entries may be built-in + short names (e.g. ``"relevance"``), fully-qualified + ``"builtin.*"`` names, or :class:`GeneratedEvaluatorRef` + instances for previously generated rubric evaluators. When + ``None`` (default), uses smart defaults based on item data. conversation_split: How to split multi-turn conversations into query/response halves. Defaults to ``LAST_TURN``. Pass a ``ConversationSplit`` enum value or a custom callable — see @@ -623,7 +1071,7 @@ def __init__( client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str | None = None, - evaluators: Sequence[str] | None = None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN, poll_interval: float = 5.0, timeout: float = 180.0, @@ -642,7 +1090,9 @@ def __init__( "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured." 
) self._model = resolved_model - self._evaluators = list(evaluators) if evaluators is not None else None + self._evaluators: list[str | GeneratedEvaluatorRef] | None = ( + list(evaluators) if evaluators is not None else None + ) self._conversation_split = conversation_split self._poll_interval = poll_interval self._timeout = timeout @@ -678,7 +1128,7 @@ async def evaluate( async def _evaluate_via_dataset( self, items: Sequence[EvalItem], - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], eval_name: str, ) -> EvalResults: """Evaluate using JSONL dataset upload path.""" @@ -752,6 +1202,334 @@ async def _evaluate_via_dataset( provider=self.name, ) + @classmethod + @experimental(feature_id=ExperimentalFeature.EVALS) + async def generate_rubric( + cls, + *, + project_client: AIProjectClient, + name: str, + agent: BaseAgent | None = None, + workflow: Workflow | None = None, + sources: Sequence[EvalGenerationSource] | None = None, + category: Literal["quality", "safety"] = "quality", + model: str | None = None, + display_name: str | None = None, + description: str | None = None, + operation_id: str | None = None, + poll_interval: float = 5.0, + timeout: float = 600.0, + ) -> GeneratedEvaluatorRef: + """Generate a Foundry rubric evaluator from an agent or workflow. + + Drives the Foundry evaluator-generation long-running operation + (``client.beta.evaluators.create_generation_job``) end-to-end and + returns a pinned :class:`GeneratedEvaluatorRef` for use with + :class:`FoundryEvals` ``evaluators=`` lists. + + Exactly one of ``agent``, ``workflow``, or ``sources`` must be + supplied. When ``agent`` or ``workflow`` is given, + :func:`agent_as_eval_source` / :func:`workflow_as_eval_source` is + used to build a single conservative source (instructions and + tools included; examples and context providers excluded). Pass + ``sources=`` directly to control inclusion explicitly or to + provide multiple sources. 
+ + Requires ``azure-ai-projects`` with the rubric-generation APIs + (currently ``2.3.0a*`` on the Azure SDK dev feed; tracked for an + upcoming PyPI release). Raises :class:`NotImplementedError` with + a clear message when the dependency is unavailable. + + Keyword Args: + project_client: Async ``AIProjectClient`` for the target + Foundry project. + name: Evaluator name to register in the project. Must be a + stable identifier (e.g. ``"policy-enforcement-v1"``). + agent: Optional ``BaseAgent`` to derive a source from. + workflow: Optional ``Workflow`` to derive a source from. + sources: Explicit list of :class:`EvalGenerationSource` + instances. Mutually exclusive with ``agent`` / ``workflow``. + category: ``"quality"`` or ``"safety"``. Defaults to + ``"quality"``. + model: Optional model deployment to drive generation. When + omitted the service picks a default. + display_name: Optional human-readable name for the evaluator. + description: Optional description for the evaluator. + operation_id: Optional caller-supplied operation id to make + the create call idempotent. + poll_interval: Seconds between job-status polls. + timeout: Maximum seconds to wait for the job to complete. + + Returns: + A pinned :class:`GeneratedEvaluatorRef` referring to the + newly created evaluator. + + Raises: + ValueError: If the source arguments are inconsistent. + NotImplementedError: If the installed ``azure-ai-projects`` + version does not expose the rubric APIs. + TimeoutError: If the job does not complete within ``timeout``. + RuntimeError: If the generation job ends in a non-succeeded + terminal state. 
+ """ + resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources) + + if category not in ("quality", "safety"): + raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.") + + try: + sdk_types = _import_generation_sdk_types() + except _RubricSdkUnavailableError as exc: + raise NotImplementedError(str(exc)) from exc + + sdk_sources = [_to_sdk_source(s, sdk_types) for s in resolved_sources] + + inputs_kwargs: dict[str, Any] = { + "name": name, + "category": category, + "sources": sdk_sources, + } + if model is not None: + inputs_kwargs["model"] = model + if display_name is not None: + inputs_kwargs["display_name"] = display_name + if description is not None: + inputs_kwargs["description"] = description + + inputs = sdk_types.EvaluatorGenerationInputs(**inputs_kwargs) + job = sdk_types.EvaluatorGenerationJob(inputs=inputs) + + create_kwargs: dict[str, Any] = {"job": job} + if operation_id is not None: + create_kwargs["operation_id"] = operation_id + + evaluators_ops = _get_beta_evaluators(project_client) + created = await evaluators_ops.create_generation_job(**create_kwargs) + completed = await _poll_generation_job( + evaluators_ops, + created, + poll_interval=poll_interval, + timeout=timeout, + ) + + return _generation_job_to_ref(completed, category=category) + + +_TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"}) + + +class _RubricSdkUnavailableError(Exception): + """Raised when azure-ai-projects lacks the rubric-generation APIs.""" + + +@dataclass(frozen=True) +class _GenerationSdkTypes: + """Resolved SDK type handles for rubric-evaluator generation.""" + + EvaluatorGenerationInputs: Any + EvaluatorGenerationJob: Any + PromptSource: Any + AgentSource: Any | None + DatasetSource: Any | None + TracesSource: Any | None + + +_RUBRIC_SDK_MISSING_MSG = ( + "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs " + "from 
azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). " + "Install a build that exposes " + "`azure.ai.projects.models.EvaluatorGenerationInputs` and " + "`AIProjectClient.beta.evaluators.create_generation_job`." +) + + +def _import_generation_sdk_types() -> _GenerationSdkTypes: + """Lazily resolve the rubric-generation SDK types from azure-ai-projects.""" + try: + from azure.ai.projects import models as _models # type: ignore[import-not-found] + except ImportError as exc: + raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) from exc + + models_mod: Any = _models + inputs_cls: Any = getattr(models_mod, "EvaluatorGenerationInputs", None) + job_cls: Any = getattr(models_mod, "EvaluatorGenerationJob", None) + prompt_cls: Any = getattr(models_mod, "PromptEvaluatorGenerationJobSource", None) + if inputs_cls is None or job_cls is None or prompt_cls is None: + raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) + + agent_cls: Any = getattr(models_mod, "AgentEvaluatorGenerationJobSource", None) + dataset_cls: Any = getattr(models_mod, "DatasetEvaluatorGenerationJobSource", None) + traces_cls: Any = getattr(models_mod, "TracesEvaluatorGenerationJobSource", None) + + return _GenerationSdkTypes( + EvaluatorGenerationInputs=inputs_cls, + EvaluatorGenerationJob=job_cls, + PromptSource=prompt_cls, + AgentSource=agent_cls, + DatasetSource=dataset_cls, + TracesSource=traces_cls, + ) + + +def _get_beta_evaluators(project_client: AIProjectClient) -> Any: + """Return the ``project_client.beta.evaluators`` operations group, or raise.""" + beta = getattr(project_client, "beta", None) + evaluators_ops = getattr(beta, "evaluators", None) if beta is not None else None + if evaluators_ops is None: + raise NotImplementedError(_RUBRIC_SDK_MISSING_MSG) + return evaluators_ops + + +def _coalesce_generation_sources( + *, + agent: BaseAgent | None, + workflow: Workflow | None, + sources: Sequence[EvalGenerationSource] | None, +) -> list[EvalGenerationSource]: + if 
sources is not None and not sources: + raise ValueError("sources= must contain at least one EvalGenerationSource.") + supplied = [bool(agent), bool(workflow), bool(sources)] + if sum(supplied) == 0: + raise ValueError("Provide one of agent=, workflow=, or sources=.") + if sum(supplied) > 1: + raise ValueError("Provide only one of agent=, workflow=, or sources=.") + if sources is not None: + return list(sources) + if agent is not None: + return [agent_as_eval_source(agent)] + if workflow is None: + raise ValueError("workflow= must be provided when agent= and sources= are not set.") + return [workflow_as_eval_source(workflow)] + + +def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) -> Any: + """Translate an :class:`EvalGenerationSource` to its SDK counterpart.""" + if source.type == "prompt": + if not source.prompt: + raise ValueError("EvalGenerationSource(type='prompt') requires a non-empty prompt.") + kwargs: dict[str, Any] = {"prompt": source.prompt} + if source.description is not None: + kwargs["description"] = source.description + return sdk_types.PromptSource(**kwargs) + if source.type == "agent": + if sdk_types.AgentSource is None: + raise NotImplementedError("Installed azure-ai-projects does not expose AgentEvaluatorGenerationJobSource.") + if not source.agent_name: + raise ValueError("EvalGenerationSource(type='agent') requires agent_name.") + kwargs = {"agent_name": source.agent_name} + if source.agent_version is not None: + kwargs["agent_version"] = source.agent_version + if source.description is not None: + kwargs["description"] = source.description + return sdk_types.AgentSource(**kwargs) + if source.type == "dataset": + if sdk_types.DatasetSource is None: + raise NotImplementedError( + "Installed azure-ai-projects does not expose DatasetEvaluatorGenerationJobSource." 
async def _poll_generation_job(
    evaluators_ops: Any,
    job: Any,
    *,
    poll_interval: float,
    timeout: float,
) -> Any:
    """Wait for a rubric-generation job to finish and return its final state.

    The job's ``status`` is compared case-insensitively against
    ``_TERMINAL_GENERATION_STATUSES``. While the job is not terminal it is
    re-fetched with ``evaluators_ops.get_generation_job(job_id)`` after
    sleeping for ``poll_interval`` seconds, capped at the time remaining
    before the deadline.

    Args:
        evaluators_ops: Operations group providing ``get_generation_job``.
        job: The job object returned by the create call; must have an ``id``.

    Keyword Args:
        poll_interval: Seconds to wait between status checks.
        timeout: Maximum seconds to wait, measured on the running loop's clock.

    Returns:
        The job object observed in the ``succeeded`` status.

    Raises:
        RuntimeError: If the job has no id, or reaches a terminal status
            other than ``succeeded``.
        TimeoutError: If the deadline passes before a terminal status is seen.
    """
    identifier = getattr(job, "id", None)
    if not identifier:
        raise RuntimeError("Rubric generation job did not return an id.")

    event_loop = asyncio.get_running_loop()
    cutoff = event_loop.time() + timeout
    snapshot = job
    while True:
        # A missing or None status is treated as "not terminal yet".
        state = (getattr(snapshot, "status", "") or "").lower()
        if state in _TERMINAL_GENERATION_STATUSES:
            if state == "succeeded":
                return snapshot
            failure = getattr(snapshot, "error", None)
            if failure is None:
                detail = state
            else:
                # Prefer the error's message; fall back to its string form.
                detail = getattr(failure, "message", None) or str(failure)
            raise RuntimeError(f"Rubric generation job {identifier} ended in status {state!r}: {detail}")

        time_left = cutoff - event_loop.time()
        if time_left <= 0:
            raise TimeoutError(
                f"Rubric generation job {identifier} did not complete within {timeout}s (last status: {state!r})."
            )
        # Never sleep past the deadline, so one last poll happens right at it.
        await asyncio.sleep(min(poll_interval, time_left))
        snapshot = await evaluators_ops.get_generation_job(identifier)
| None = None + if dimensions_raw: + parsed: list[RubricDimension] = [] + for entry in dimensions_raw: + try: + parsed.append( + RubricDimension( + id=str(getattr(entry, "id", "") or ""), + description=str(getattr(entry, "description", "") or ""), + weight=int(getattr(entry, "weight", 0) or 0), + always_applicable=bool(getattr(entry, "always_applicable", False)), + ) + ) + except (TypeError, ValueError): + logger.debug("Skipping malformed dimension on generated evaluator", exc_info=True) + if parsed: + dimensions = tuple(parsed) + + pass_threshold: float | None = None + if definition is not None: + raw_threshold = getattr(definition, "pass_threshold", None) + if isinstance(raw_threshold, (int, float)): + pass_threshold = float(raw_threshold) + + return GeneratedEvaluatorRef( + name=str(ev_name), + version=str(ev_version), + category=category, + display_name=getattr(evaluator, "display_name", None), + description=getattr(evaluator, "description", None), + dimensions=dimensions, + pass_threshold=pass_threshold, + ) + # --------------------------------------------------------------------------- # Foundry-specific functions (not part of the Evaluator protocol) diff --git a/python/packages/foundry/tests/test_evals_config.py b/python/packages/foundry/tests/test_evals_config.py new file mode 100644 index 0000000000..a1c86187d4 --- /dev/null +++ b/python/packages/foundry/tests/test_evals_config.py @@ -0,0 +1,273 @@ +# Copyright (c) Microsoft. All rights reserved. 
def _make_agent(name: str = "agent-a", instructions: str = "Be brief.") -> Any:
    """Build a mock agent whose ``as_eval_source`` renders a real dossier.

    The mock carries the attributes set below (name, description,
    default options, context providers, MCP tools) and routes
    ``as_eval_source(**kw)`` to ``_render_agent_dossier``, filling in a
    default for any inclusion flag the caller omits.
    """
    from agent_framework._evaluation import _render_agent_dossier

    # Fallback used for each inclusion flag missing from the call.
    flag_defaults = {
        "include_instructions": True,
        "include_tools": True,
        "include_context_providers": False,
        "include_examples": False,
    }

    stub = MagicMock()
    stub.name = name
    stub.description = f"{name} description"
    stub.default_options = {"instructions": instructions, "tools": []}
    stub.context_providers = []
    stub.mcp_tools = []

    def _render(**options: Any) -> Any:
        flags = {flag: options.get(flag, fallback) for flag, fallback in flag_defaults.items()}
        return _render_agent_dossier(stub, examples=options.get("examples"), **flags)

    stub.as_eval_source.side_effect = _render
    return stub
into RubricGenerationSpec instances.""" + + def test_minimal_spec(self) -> None: + config = parse_evals_config({ + "evaluators": { + "my-rubric": { + "type": "foundry.generated_rubric", + } + } + }) + assert "my-rubric" in config + spec = config["my-rubric"] + assert spec.name == "my-rubric" + assert spec.type == "foundry.generated_rubric" + assert spec.category == "quality" + assert spec.sources == () + + def test_full_spec_with_sources(self) -> None: + config = parse_evals_config({ + "evaluators": { + "reservation-quality": { + "type": "foundry.generated_rubric", + "category": "quality", + "model": "gpt-4o", + "agent": "reservation-agent", + "display_name": "Reservation Quality", + "description": "Custom rubric for reservation agent.", + "sources": [ + { + "type": "agent", + "include_instructions": True, + "include_tools": True, + "include_context_providers": True, + }, + { + "type": "dataset", + "name": "reservation-business-rules", + "version": 1, + }, + ], + } + } + }) + spec = config["reservation-quality"] + assert spec.model == "gpt-4o" + assert spec.agent == "reservation-agent" + assert spec.display_name == "Reservation Quality" + assert len(spec.sources) == 2 + + agent_src = spec.sources[0] + assert agent_src.type == "agent" + assert agent_src.include_context_providers is True + + dataset_src = spec.sources[1] + assert dataset_src.type == "dataset" + assert dataset_src.name == "reservation-business-rules" + assert dataset_src.version == "1" # coerced to string + + def test_rejects_non_mapping(self) -> None: + with pytest.raises(ValueError, match="must be a mapping"): + parse_evals_config([]) + + def test_rejects_missing_evaluators_key(self) -> None: + with pytest.raises(ValueError, match="evaluators"): + parse_evals_config({"other": {}}) + + def test_rejects_unknown_type(self) -> None: + with pytest.raises(ValueError, match="unsupported type"): + parse_evals_config({"evaluators": {"x": {"type": "foundry.other"}}}) + + def 
test_rejects_invalid_category(self) -> None: + with pytest.raises(ValueError, match="invalid category"): + parse_evals_config({"evaluators": {"x": {"type": "foundry.generated_rubric", "category": "bogus"}}}) + + def test_rejects_invalid_source_type(self) -> None: + with pytest.raises(ValueError, match="invalid type"): + parse_evals_config({ + "evaluators": { + "x": { + "type": "foundry.generated_rubric", + "sources": [{"type": "bogus"}], + } + } + }) + + +class TestLoadEvalsConfig: + """End-to-end YAML loading.""" + + def test_load_from_yaml_file(self, tmp_path: Path) -> None: + pytest.importorskip("yaml") + config_path = tmp_path / "evals.yaml" + config_path.write_text( + textwrap.dedent( + """\ + evaluators: + my-eval: + type: foundry.generated_rubric + category: safety + model: gpt-4o-mini + sources: + - type: prompt + prompt: "Score the response." + """ + ), + encoding="utf-8", + ) + config = load_evals_config(config_path) + assert "my-eval" in config + spec = config["my-eval"] + assert spec.category == "safety" + assert spec.model == "gpt-4o-mini" + assert len(spec.sources) == 1 + assert spec.sources[0].type == "prompt" + assert spec.sources[0].prompt == "Score the response." 
+ + +class TestBuildSources: + """Translate RubricGenerationSpec sources into EvalGenerationSource instances.""" + + def test_no_sources_with_agent_default(self) -> None: + spec = RubricGenerationSpec(name="x") + agent = _make_agent() + sources = build_sources(spec, agent=agent) + assert len(sources) == 1 + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Agent name: agent-a" in sources[0].prompt + + def test_no_sources_with_workflow_default(self) -> None: + spec = RubricGenerationSpec(name="x") + workflow = _make_workflow() + sources = build_sources(spec, workflow=workflow) + assert len(sources) == 1 + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Workflow name: wf-1" in sources[0].prompt + + def test_no_sources_no_agent_or_workflow_raises(self) -> None: + spec = RubricGenerationSpec(name="x") + with pytest.raises(ValueError, match="no sources"): + build_sources(spec) + + def test_agent_source_uses_supplied_agent(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="agent", include_context_providers=True),), + ) + agent = _make_agent() + sources = build_sources(spec, agent=agent) + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Agent name: agent-a" in sources[0].prompt + + def test_agent_source_with_agent_name_uses_hosted_path(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="agent", agent_name="hosted-foundry-agent"),), + ) + sources = build_sources(spec) + assert sources[0].type == "agent" + assert sources[0].agent_name == "hosted-foundry-agent" + + def test_agent_source_without_agent_raises(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="agent"),), + ) + with pytest.raises(ValueError, match="no agent="): + build_sources(spec) + + def test_workflow_source_uses_supplied_workflow(self) -> None: + spec = RubricGenerationSpec( + 
def _make_stub_agent(
    *,
    name: str = "alpha",
    description: str = "An agent.",
    instructions: str = "Be brief.",
) -> MagicMock:
    """Return a mock agent whose ``as_eval_source`` produces a real dossier string.

    ``as_eval_source(**kw)`` is routed to ``_render_agent_dossier`` with the
    mock itself as the agent; inclusion flags missing from the call fall
    back to the defaults listed below.
    """
    from agent_framework._evaluation import _render_agent_dossier

    # Fallback used for each inclusion flag missing from the call.
    flag_defaults = {
        "include_instructions": True,
        "include_tools": True,
        "include_context_providers": False,
        "include_examples": False,
    }

    stub = MagicMock()
    stub.name = name
    stub.description = description
    stub.default_options = {"instructions": instructions, "tools": []}
    stub.context_providers = []
    stub.mcp_tools = []

    def _render(**options: Any) -> Any:
        flags = {flag: options.get(flag, fallback) for flag, fallback in flag_defaults.items()}
        return _render_agent_dossier(stub, examples=options.get("examples"), **flags)

    stub.as_eval_source.side_effect = _render
    return stub
test_generated_evaluator_ref_tool_definitions_added(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria( + [ref], + "gpt-4o", + include_data_mapping=True, + include_tool_definitions=True, + ) + + assert criteria[0]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}" + + def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None: + import logging + + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef.latest("my-rubric") + with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"): + criteria = _build_testing_criteria([ref], "gpt-4o") + + assert "evaluator_version" not in criteria[0] + assert any("no pinned version" in r.message for r in caplog.records) + + def test_generated_evaluator_ref_mixed_with_builtins(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria( + ["relevance", ref, "task_adherence"], + "gpt-4o", + include_data_mapping=True, + ) + + assert [c["name"] for c in criteria] == ["relevance", "my-rubric", "task_adherence"] + assert criteria[0]["evaluator_name"] == "builtin.relevance" + assert criteria[1]["evaluator_name"] == "my-rubric" + assert criteria[2]["evaluator_name"] == "builtin.task_adherence" + # --------------------------------------------------------------------------- # _build_item_schema @@ -1263,6 +1356,31 @@ def test_raises_when_all_filtered(self) -> None: items, ) + def test_preserves_generated_ref_when_no_tools(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="rubric", version="1") + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators( + 
["relevance", ref, "tool_call_accuracy"], + items, + ) + assert "relevance" in result + assert ref in result + assert "tool_call_accuracy" not in result + + def test_generated_ref_alone_does_not_raise(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="rubric", version="1") + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators([ref], items) + assert result == [ref] + # --------------------------------------------------------------------------- # EvalResults @@ -2369,6 +2487,124 @@ async def test_handles_api_failure_gracefully(self) -> None: items = await _fetch_output_items(mock_client, "eval_1", "run_1") assert items == [] + async def test_extracts_rubric_scores_from_dict_sample(self) -> None: + from agent_framework_foundry._foundry_evals import _fetch_output_items + + mock_result = MagicMock() + mock_result.name = "my-rubric" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = { + "properties": { + "rubric_scores": [ + {"id": "policy", "score": 4, "applicable": True, "weight": 1, "reason": "ok"}, + {"id": "safety", "score": None, "applicable": False, "weight": 1, "reason": "n/a"}, + ] + } + } + + mock_oi = MagicMock() + mock_oi.id = "oi_1" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = None + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + scores = items[0].scores + assert len(scores) == 1 + assert scores[0].dimensions is not None + assert len(scores[0].dimensions) == 2 + policy = next(d for d in scores[0].dimensions if d.id == "policy") + assert policy.score == 4 + assert policy.applicable is True + assert policy.weight == 1 + assert policy.reason == "ok" + safety = next(d 
class TestExtractRubricScores:
    """Unit tests for ``_extract_rubric_scores`` over dict- and attribute-style samples."""

    def test_handles_attribute_style_properties(self) -> None:
        """Scores reachable as ``sample.properties.rubric_scores`` are parsed."""
        from agent_framework_foundry._foundry_evals import _extract_rubric_scores

        entry = MagicMock(id="policy", score=5, applicable=True, weight=2, reason="ok")
        sample = MagicMock()
        sample.properties = MagicMock(rubric_scores=[entry])

        parsed = _extract_rubric_scores(sample)
        assert parsed is not None
        first = parsed[0]
        assert first.id == "policy"
        assert first.score == 5
        assert first.weight == 2

    def test_top_level_rubric_scores_in_dict(self) -> None:
        """A ``rubric_scores`` key at the top level of a dict sample is accepted."""
        from agent_framework_foundry._foundry_evals import _extract_rubric_scores

        entry = {"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}
        parsed = _extract_rubric_scores({"rubric_scores": [entry]})
        assert parsed is not None
        assert parsed[0].id == "a"

    def test_returns_none_when_missing(self) -> None:
        """Samples carrying no rubric scores yield ``None``."""
        from agent_framework_foundry._foundry_evals import _extract_rubric_scores

        for empty_sample in (None, {}, {"properties": {}}):
            assert _extract_rubric_scores(empty_sample) is None

    def test_skips_malformed_entries(self) -> None:
        """An entry lacking ``weight`` is dropped while the valid entry is kept."""
        from agent_framework_foundry._foundry_evals import _extract_rubric_scores

        well_formed = {"id": "good", "score": 3, "applicable": True, "weight": 1, "reason": "ok"}
        missing_weight = {"id": "bad-no-weight", "score": 2, "applicable": True, "reason": "x"}
        sample = {"properties": {"rubric_scores": [well_formed, missing_weight]}}

        parsed = _extract_rubric_scores(sample)
        assert parsed is not None
        assert len(parsed) == 1
        assert parsed[0].id == "good"
+ + def test_explicit_hosted_agent_version_forwarded(self) -> None: + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot") + source = agent_as_eval_source( + agent, + hosted_agent_name="weather-bot-hosted-id", + hosted_agent_version="3", + ) + assert source.type == "agent" + assert source.agent_name == "weather-bot-hosted-id" + assert source.agent_version == "3" + + def test_auto_detects_hosted_foundry_agent(self) -> None: + """A chat_client carrying agent_name/agent_version is treated as a hosted agent.""" + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") + agent.chat_client = MagicMock() + agent.chat_client.agent_name = "weather-prompt-agent" + agent.chat_client.agent_version = "2" + + source = agent_as_eval_source(agent) + assert source.type == "agent" + assert source.agent_name == "weather-prompt-agent" + assert source.agent_version == "2" + assert source.prompt is None + assert source.description == "Looks up the weather." 
+
+    def test_auto_detection_handles_versionless_hosted_agent(self) -> None:
+        """HostedAgents typically omit agent_version (no None forwarded)."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-hosted-agent"
+        agent.chat_client.agent_version = None
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "agent"
+        assert source.agent_name == "weather-hosted-agent"
+        assert source.agent_version is None
+
+    def test_force_prompt_source_overrides_auto_detection(self) -> None:
+        """force_prompt_source=True falls back to dossier even for hosted agents."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-prompt-agent"
+        agent.chat_client.agent_version = "2"
+
+        source = agent_as_eval_source(agent, force_prompt_source=True)
+        assert source.type == "prompt"
+        assert source.prompt is not None
+        assert "Agent name: weather-bot" in source.prompt
+
+    def test_auto_detection_ignores_non_string_chat_client_fields(self) -> None:
+        """Bare MagicMock chat_client (untyped attrs) must not trigger detection."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="local-agent")
+        agent.chat_client = MagicMock()  # agent_name attr resolves to a MagicMock, not a str
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "prompt"
+        assert source.prompt is not None
+        assert "Agent name: local-agent" in source.prompt
+
+    def test_forwards_keyword_options_to_agent(self) -> None:
+        """With include_instructions=False the generated prompt has no "Instructions:" section."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent()
+        source = agent_as_eval_source(agent, include_instructions=False)
+        assert source.prompt is not None
+        assert "Instructions:" not in source.prompt
+
+
+class TestFoundryWorkflowAsEvalSource:
+    """Tests for foundry's workflow_as_eval_source helper (wraps Workflow.as_eval_source)."""
+
+    def _make_workflow(self) -> MagicMock:
+        """Build a MagicMock workflow whose as_eval_source renders through _render_workflow_dossier."""
+        from agent_framework._evaluation import _render_workflow_dossier
+
+        workflow = MagicMock()
+        workflow.name = "demo-workflow"
+        workflow.description = "Routes user questions."
+        workflow.to_dict.return_value = {
+            "name": "demo-workflow",
+            "id": "wf_1",
+            "executors": {},
+            "edge_groups": [],
+        }
+        workflow.executors = {}
+        workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(
+            workflow,
+            include_instructions=kw.get("include_instructions", True),
+            include_tools=kw.get("include_tools", True),
+            include_context_providers=kw.get("include_context_providers", False),
+            include_examples=kw.get("include_examples", False),
+            examples=kw.get("examples"),
+            include_topology=kw.get("include_topology", True),
+        )
+        return workflow
+
+    def test_returns_prompt_source_with_topology(self) -> None:
+        """The default call yields a prompt source with the workflow description and a topology section."""
+        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
+
+        workflow = self._make_workflow()
+        source = workflow_as_eval_source(workflow)
+        assert source.type == "prompt"
+        assert source.description == "Routes user questions."
+        assert source.prompt is not None
+        assert "Workflow name: demo-workflow" in source.prompt
+        assert "Topology (JSON):" in source.prompt
+
+    def test_topology_can_be_disabled(self) -> None:
+        """include_topology=False leaves the "Topology (JSON):" section out of the prompt."""
+        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
+
+        workflow = self._make_workflow()
+        source = workflow_as_eval_source(workflow, include_topology=False)
+        assert source.prompt is not None
+        assert "Topology (JSON):" not in source.prompt
+
+
+class TestCoalesceGenerationSources:
+    """Validation for the source-resolution helper used by FoundryEvals.generate_rubric."""
+
+    def test_requires_exactly_one_source(self) -> None:
+        """Passing no agent, workflow, or sources raises ValueError matching "Provide one of"."""
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        with pytest.raises(ValueError, match="Provide one of"):
+            _coalesce_generation_sources(agent=None, workflow=None, sources=None)
+
+    def test_rejects_multiple_sources(self) -> None:
+        """Supplying both an agent and explicit sources raises ValueError matching "only one of"."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _coalesce_generation_sources
+
+        agent = MagicMock()
+        agent.name = "a"
+        agent.description = None
+        agent.default_options = {"instructions": "x", "tools": []}
+        agent.context_providers = []
+        agent.mcp_tools = []
+        with pytest.raises(ValueError, match="only one of"):
+            _coalesce_generation_sources(
+                agent=agent,
+                workflow=None,
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
+            )
+
+    def test_uses_agent_helper_when_only_agent_supplied(self) -> None:
+        """An agent on its own resolves to exactly one prompt source that names the agent."""
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        agent = _make_stub_agent(name="alpha", description="An agent.")
+
+        sources = _coalesce_generation_sources(agent=agent, workflow=None, sources=None)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Agent name: alpha" in sources[0].prompt
+
+    def test_rejects_empty_sources_list(self) -> None:
+        """An empty sources list raises ValueError matching "at least one"."""
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        with pytest.raises(ValueError, match="at least one"):
+            _coalesce_generation_sources(agent=None, workflow=None, sources=[])
+
+
+class TestToSdkSource:
+    """Translation between EvalGenerationSource and SDK *JobSource types."""
+
+    def _make_sdk_types(self, *, with_agent: bool = True, with_dataset: bool = True, with_traces: bool = True) -> Any:
+        """Build a _GenerationSdkTypes of MagicMocks; a ``with_*`` flag set to False leaves that type as None."""
+        from agent_framework_foundry._foundry_evals import _GenerationSdkTypes
+
+        return _GenerationSdkTypes(
+            EvaluatorGenerationInputs=MagicMock(),
+            EvaluatorGenerationJob=MagicMock(),
+            PromptSource=MagicMock(name="PromptSource"),
+            AgentSource=MagicMock(name="AgentSource") if with_agent else None,
+            DatasetSource=MagicMock(name="DatasetSource") if with_dataset else None,
+            TracesSource=MagicMock(name="TracesSource") if with_traces else None,
+        )
+
+    def test_prompt_source_is_translated(self) -> None:
+        """A prompt source is built by calling PromptSource with its prompt and description."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.PromptSource.return_value = "prompt-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="prompt", prompt="hello", description="d"),
+            sdk,
+        )
+        assert out == "prompt-sdk-instance"
+        sdk.PromptSource.assert_called_once_with(prompt="hello", description="d")
+
+    def test_prompt_without_text_raises(self) -> None:
+        """A prompt source without prompt text raises ValueError matching "non-empty prompt"."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        with pytest.raises(ValueError, match="non-empty prompt"):
+            _to_sdk_source(EvalGenerationSource(type="prompt"), sdk)
+
+    def test_agent_source_is_translated(self) -> None:
+        """An agent source without a version calls AgentSource with agent_name only."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.AgentSource.return_value = "agent-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="agent", agent_name="my-hosted-agent"),
+            sdk,
+        )
+        assert out == "agent-sdk-instance"
+        sdk.AgentSource.assert_called_once_with(agent_name="my-hosted-agent")
+
+    def test_agent_source_requires_name(self) -> None:
+        """An agent source with no agent_name raises ValueError mentioning agent_name."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        with pytest.raises(ValueError, match="agent_name"):
+            _to_sdk_source(EvalGenerationSource(type="agent"), sdk)
+
+    def test_agent_source_raises_when_sdk_missing(self) -> None:
+        """An agent source raises NotImplementedError when the SDK types carry no AgentSource."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types(with_agent=False)
+        with pytest.raises(NotImplementedError, match="AgentEvaluatorGenerationJobSource"):
+            _to_sdk_source(
+                EvalGenerationSource(type="agent", agent_name="x"),
+                sdk,
+            )
+
+    def test_dataset_source_is_translated(self) -> None:
+        """dataset_name / dataset_version are passed to DatasetSource as name / version."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.DatasetSource.return_value = "dataset-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="dataset", dataset_name="ds", dataset_version="1"),
+            sdk,
+        )
+        assert out == "dataset-sdk-instance"
+        sdk.DatasetSource.assert_called_once_with(name="ds", version="1")
+
+    def test_agent_source_forwards_agent_version(self) -> None:
+        """agent_version, when set, is passed to AgentSource alongside agent_name."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.AgentSource.return_value = "agent-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="agent", agent_name="prompt-agent", agent_version="2"),
+            sdk,
+        )
+        assert out == "agent-sdk-instance"
+        sdk.AgentSource.assert_called_once_with(agent_name="prompt-agent", agent_version="2")
+
+
+class TestPollGenerationJob:
+    """Behavior of the rubric-generation polling loop."""
+
+    async def test_returns_immediately_on_succeeded(self) -> None:
+        """A job that is already "succeeded" is returned without calling get_generation_job."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock()
+        job = MagicMock(id="job_1", status="succeeded")
+        out = await _poll_generation_job(evaluators_ops, job, poll_interval=0.01, timeout=1.0)
+        assert out is job
+        evaluators_ops.get_generation_job.assert_not_called()
+
+    async def test_polls_until_terminal(self) -> None:
+        """Polling continues past a "running" result and returns the "succeeded" job."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        running = MagicMock(id="job_1", status="running")
+        succeeded = MagicMock(id="job_1", status="succeeded")
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock(side_effect=[running, succeeded])
+
+        initial = MagicMock(id="job_1", status="running")
+        out = await _poll_generation_job(evaluators_ops, initial, poll_interval=0.001, timeout=1.0)
+        assert out is succeeded
+        assert evaluators_ops.get_generation_job.await_count == 2
+
+    async def test_failed_status_raises(self) -> None:
+        """A "failed" job raises RuntimeError whose text includes the job's error message."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        err = MagicMock(message="boom")
+        terminal = MagicMock(id="job_1", status="failed", error=err)
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock(return_value=terminal)
+
+        with pytest.raises(RuntimeError, match="boom"):
+            await _poll_generation_job(
+                evaluators_ops,
+                MagicMock(id="job_1", status="running"),
+                poll_interval=0.001,
+                timeout=1.0,
+            )
+
+    async def test_timeout_raises(self) -> None:
+        """A job that stays "running" past the timeout raises TimeoutError."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        running = MagicMock(id="job_1", status="running")
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock(return_value=running)
+
+        with pytest.raises(TimeoutError):
+            await _poll_generation_job(evaluators_ops, running, poll_interval=0.001, timeout=0.005)
+
+
+class TestGenerationJobToRef:
+    """Translation of a completed generation job to a GeneratedEvaluatorRef."""
+
+    def test_builds_pinned_ref_with_dimensions(self) -> None:
+        """Evaluator fields, pass_threshold, and dimensions are copied onto the ref; the version becomes a str."""
+        from agent_framework_foundry._foundry_evals import RubricDimension, _generation_job_to_ref
+
+        dim = MagicMock(id="d1", description="dim", weight=2, always_applicable=True)
+        definition = MagicMock(dimensions=[dim], pass_threshold=0.75)
+        evaluator = MagicMock(
+            version=3,
+            display_name="My Eval",
+            description="A custom rubric.",
+            definition=definition,
+        )
+        # ``name`` is a Mock constructor argument (it only labels the mock's repr), so assign the attribute here.
+        evaluator.name = "my-eval"
+        job = MagicMock(artifacts=MagicMock(evaluator=evaluator))
+
+        ref = _generation_job_to_ref(job, category="quality")
+        assert ref.name == "my-eval"
+        assert ref.version == "3"
+        assert ref.display_name == "My Eval"
+        assert ref.description == "A custom rubric."
+        assert ref.category == "quality"
+        assert ref.pass_threshold == 0.75
+        assert ref.dimensions is not None
+        assert ref.dimensions[0] == RubricDimension(id="d1", description="dim", weight=2, always_applicable=True)
+
+    def test_missing_artifacts_raises(self) -> None:
+        """A job with artifacts=None raises RuntimeError matching "evaluator artifact"."""
+        from agent_framework_foundry._foundry_evals import _generation_job_to_ref
+
+        job = MagicMock(artifacts=None)
+        with pytest.raises(RuntimeError, match="evaluator artifact"):
+            _generation_job_to_ref(job, category="quality")
+
+
+class TestGenerateRubricSdkMissing:
+    """generate_rubric raises NotImplementedError when SDK lacks the rubric APIs."""
+
+    async def test_raises_when_sdk_types_unavailable(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """When the SDK type import raises _RubricSdkUnavailableError, the call fails with NotImplementedError."""
+        from agent_framework_foundry import _foundry_evals as fm
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+        def _raise() -> Any:
+            raise fm._RubricSdkUnavailableError(fm._RUBRIC_SDK_MISSING_MSG)
+
+        monkeypatch.setattr(fm, "_import_generation_sdk_types", _raise)
+
+        project_client = MagicMock()
+
+        with pytest.raises(NotImplementedError, match="rubric"):
+            await FoundryEvals.generate_rubric(
+                project_client=project_client,
+                name="my-eval",
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
+            )
+
+    async def test_raises_value_error_on_invalid_category(self) -> None:
+        """category outside {quality, safety} should fail fast at the boundary."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+        project_client = MagicMock()
+
+        with pytest.raises(ValueError, match="category"):
+            await FoundryEvals.generate_rubric(
+                project_client=project_client,
+                name="my-eval",
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
+                category=cast("Any", "invalid"),
+            )
+
+
+class TestGenerateRubricE2E:
+    """End-to-end happy path for generate_rubric with mocked SDK."""
+
+    async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """generate_rubric(agent=...) assembles the prompt source, inputs, and job, then returns the pinned ref."""
+        from agent_framework_foundry import _foundry_evals as fm
+
+        # Stub SDK type handles
+        prompt_cls = MagicMock(name="PromptSource")
+        prompt_cls.return_value = "sdk-prompt"
+        inputs_cls = MagicMock(name="EvaluatorGenerationInputs")
+        inputs_cls.return_value = "sdk-inputs"
+        job_cls = MagicMock(name="EvaluatorGenerationJob")
+        job_cls.return_value = "sdk-job"
+
+        sdk_types = fm._GenerationSdkTypes(
+            EvaluatorGenerationInputs=inputs_cls,
+            EvaluatorGenerationJob=job_cls,
+            PromptSource=prompt_cls,
+            AgentSource=None,
+            DatasetSource=None,
+            TracesSource=None,
+        )
+        monkeypatch.setattr(fm, "_import_generation_sdk_types", lambda: sdk_types)
+
+        # Mock the SDK operations and completed job
+        completed_evaluator = MagicMock(version="7", display_name=None, description=None)
+        completed_evaluator.name = "agent-rubric"
+        completed_evaluator.definition = MagicMock(dimensions=[], pass_threshold=None)
+        completed = MagicMock(
+            id="job_42",
+            status="succeeded",
+            artifacts=MagicMock(evaluator=completed_evaluator),
+        )
+
+        evaluators_ops = MagicMock()
+        evaluators_ops.create_generation_job = AsyncMock(return_value=completed)
+        evaluators_ops.get_generation_job = AsyncMock(return_value=completed)
+        project_client = MagicMock()
+        project_client.beta = MagicMock(evaluators=evaluators_ops)
+
+        # Build a stub agent
+        agent = _make_stub_agent(
+            name="weather-bot",
+            description="Looks up weather.",
+            instructions="Be brief.",
+        )
+
+        ref = await FoundryEvals.generate_rubric(
+            project_client=project_client,
+            name="agent-rubric",
+            agent=agent,
+            category="quality",
+            model="gpt-4o",
+            display_name="Display",
+            description="Desc",
+            operation_id="op-123",
+        )
+
+        assert ref.name == "agent-rubric"
+        assert ref.version == "7"
+        assert ref.category == "quality"
+
+        # Verify inputs/job/source assembly
+        prompt_cls.assert_called_once()
+        prompt_kwargs = prompt_cls.call_args.kwargs
+        assert "Agent name: weather-bot" in prompt_kwargs["prompt"]
+        assert prompt_kwargs["description"] == "Looks up weather."
+
+        inputs_cls.assert_called_once()
+        inputs_kwargs = inputs_cls.call_args.kwargs
+        assert inputs_kwargs["name"] == "agent-rubric"
+        assert inputs_kwargs["category"] == "quality"
+        assert inputs_kwargs["model"] == "gpt-4o"
+        assert inputs_kwargs["display_name"] == "Display"
+        assert inputs_kwargs["description"] == "Desc"
+        assert inputs_kwargs["sources"] == ["sdk-prompt"]
+
+        job_cls.assert_called_once_with(inputs="sdk-inputs")
+        evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123")
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
new file mode 100644
index 0000000000..9c19ff552b
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
@@ -0,0 +1,151 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Generate a Foundry rubric evaluator from an agent and use it in CI.
+
+This sample demonstrates the end-to-end adaptive-evals flow:
+
+1. Build an agent.
+2. Generate a rubric evaluator from the agent using
+   ``FoundryEvals.generate_rubric()`` — produces a pinned
+   ``GeneratedEvaluatorRef`` you can store in source control.
+3. Use the pinned reference in ``evaluators=[...]`` for a regression
+   run alongside built-in evaluators.
+4. 
Assert quality gates with ``assert_score_at_least`` /
+   ``assert_dimension_score_at_least`` / ``assert_no_failed_items``.
+
+A companion ``evaluators.yaml`` shows the source-controlled config
+pattern for CI. Load it with :func:`load_evals_config` and pass the
+resulting spec through :func:`build_sources` to keep generation
+parameters out of code.
+
+Prerequisites:
+- An Azure AI Foundry project with a deployed model.
+- ``azure-ai-projects`` build that includes the rubric-generation APIs.
+- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``.
+
+Run with:
+
+.. code-block:: bash
+
+    az login
+    python evaluate_with_generated_rubric_sample.py
+"""
+
+import asyncio
+import os
+import textwrap
+from pathlib import Path
+
+from agent_framework import evaluate_agent
+from agent_framework.foundry import (
+    FoundryChatClient,
+    FoundryEvals,
+    build_sources,
+    load_evals_config,
+)
+from azure.ai.projects.aio import AIProjectClient
+from azure.identity.aio import AzureCliCredential
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def get_weather(location: str) -> str:
+    """Get the current weather for a location."""
+    samples = {
+        "seattle": "62F, cloudy with a chance of rain",
+        "london": "55F, overcast",
+        "paris": "68F, partly sunny",
+    }
+    return samples.get(location.lower(), f"Weather data not available for {location}")
+
+
+SAMPLE_YAML = textwrap.dedent(
+    """\
+    evaluators:
+      travel-quality:
+        type: foundry.generated_rubric
+        category: quality
+        model: gpt-4o
+        display_name: Travel Quality Rubric
+        description: Custom rubric tailored to the travel-assistant agent.
+        sources:
+          - type: agent
+            include_instructions: true
+            include_tools: true
+    """
+)
+
+
+async def main() -> None:
+    """Build the travel agent, generate its rubric evaluator, run the evaluation, and assert the quality gates."""
+    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
+    model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o")
+
+    credential = AzureCliCredential()
+    chat_client = FoundryChatClient(project_endpoint=project_endpoint, model=model_name, credential=credential)
+    project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)
+
+    # The quality gates below raise on failure; ``finally`` still closes the async clients in that case.
+    try:
+        agent = chat_client.as_agent(
+            name="travel-assistant",
+            instructions=(
+                "You are a helpful travel assistant. Always ground recommendations in tool output, "
+                "cite each tool result, and refuse questions outside travel planning."
+            ),
+            tools=[get_weather],
+        )
+
+        # 1. Load the source-controlled evaluator config.
+        config_path = Path(__file__).with_name("evaluators.yaml")
+        if not config_path.exists():
+            config_path.write_text(SAMPLE_YAML, encoding="utf-8")
+            print(f"Wrote sample config to {config_path}")
+        config = load_evals_config(config_path)
+        spec = config["travel-quality"]
+
+        # 2. Generate (or refresh) the rubric evaluator. In CI you typically run
+        # this once and commit the returned name/version pair.
+        print("Generating rubric evaluator from agent + spec...")
+        sources = build_sources(spec, agent=agent)
+        rubric_ref = await FoundryEvals.generate_rubric(
+            project_client=project_client,
+            name=spec.name,
+            sources=sources,
+            category=spec.category,
+            model=spec.model,
+            display_name=spec.display_name,
+            description=spec.description,
+        )
+        dimension_count = len(rubric_ref.dimensions or ())
+        print(f"Generated rubric {rubric_ref.name}@{rubric_ref.version} with {dimension_count} dimensions")
+
+        # 3. Run an evaluation that combines built-ins with the new rubric.
+        evals = FoundryEvals(
+            client=chat_client,
+            evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref],
+        )
+        results = await evaluate_agent(
+            agent=agent,
+            queries=[
+                "What's the weather in Seattle?",
+                "Should I pack an umbrella for London?",
+            ],
+            evaluators=evals,
+        )
+
+        # 4. Quality gates — wire these into your CI job's exit status.
+        for r in results:
+            print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}")
+            r.assert_no_failed_items()
+            r.assert_score_at_least(0.8)
+            if rubric_ref.dimensions:
+                r.assert_dimension_score_at_least(rubric_ref.dimensions[0].id, 3)
+    finally:
+        await project_client.close()
+        await credential.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
new file mode 100644
index 0000000000..f3e698c77c
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
@@ -0,0 +1,11 @@
+evaluators:
+  travel-quality:
+    type: foundry.generated_rubric
+    category: quality
+    model: gpt-4o
+    display_name: Travel Quality Rubric
+    description: Custom rubric tailored to the travel-assistant agent.
+    sources:
+      - type: agent
+        include_instructions: true
+        include_tools: true
diff --git a/python/uv.lock b/python/uv.lock
index 58c0ed50ee..dee89c9f0a 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -604,7 +604,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
     { name = "agent-framework-core", editable = "packages/core" },
-    { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = "<=1.0.0b2,>=1.0.0b2" },
+    { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = ">=1.0.0b2,<=1.0.0b2" },
 ]
 
 [[package]]