From e45b934cc219cbbf0b452d27d10815ac480cf917 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 17:24:02 -0700 Subject: [PATCH 01/16] Python: feat(evals): RubricScore type + EvalScoreResult.dimensions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../packages/core/agent_framework/__init__.py | 2 ++ .../core/agent_framework/_evaluation.py | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index 356051da3ff..52368df476b 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -70,6 +70,7 @@ Evaluator, ExpectedToolCall, LocalEvaluator, + RubricScore, evaluate_agent, evaluate_workflow, evaluator, @@ -425,6 +426,7 @@ "ResponseStream", "Role", "RoleLiteral", + "RubricScore", "RunContext", "Runner", "RunnerContext", diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 64fab0eacb6..32ae5bcfba4 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -311,12 +311,15 @@ class EvalScoreResult: score: Numeric score from the evaluator. passed: Whether the item passed this evaluator's threshold. sample: Optional raw evaluator output (rationale, metadata). + dimensions: Per-dimension scores for rubric-based evaluators. + ``None`` for non-rubric (e.g. built-in) evaluators. """ name: str score: float passed: bool | None = None sample: dict[str, Any] | None = None + dimensions: list[RubricScore] | None = None @experimental(feature_id=ExperimentalFeature.EVALS) @@ -496,6 +499,37 @@ def raise_for_status(self, msg: str | None = None) -> None: detail += f" Errored items: {', '.join(summaries)}." 
raise EvalNotPassedError(detail) +# endregion + +# region Generated rubric evaluators + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricScore: + """A single dimension's score from a rubric-based evaluator run. + + Rubric evaluators (e.g. Foundry's generated rubric evaluators) emit + one ``RubricScore`` per dimension per item. Attached to + :class:`EvalScoreResult` as a typed view of the raw + ``properties.rubric_scores`` payload. + + Attributes: + id: Stable identifier for the dimension (e.g. + ``"policy_enforcement"``) defined by the rubric. + score: Numeric score, or ``None`` when the dimension was marked + non-applicable for this item. + applicable: Whether the dimension applied to this item. + weight: Dimension weight (mirrors the rubric definition). + reason: Short rationale produced by the evaluator. + """ + + id: str + score: int | None + applicable: bool + weight: int + reason: str + # endregion From e5830dd7fcdcabfb75ce2ca99d47fdb40fa9ef61 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 17:31:35 -0700 Subject: [PATCH 02/16] Python: feat(foundry-evals): RubricDimension + GeneratedEvaluatorRef + accept in evaluators= Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/__init__.py | 4 + .../agent_framework_foundry/_foundry_evals.py | 192 ++++++++++++++++-- .../foundry/tests/test_foundry_evals.py | 104 ++++++++++ python/uv.lock | 2 +- 4 files changed, 280 insertions(+), 22 deletions(-) diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index 002e63f8a6b..14eebfaffa0 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -12,6 +12,8 @@ ) from ._foundry_evals import ( FoundryEvals, + GeneratedEvaluatorRef, + RubricDimension, 
evaluate_foundry_target, evaluate_traces, ) @@ -32,10 +34,12 @@ "FoundryEmbeddingSettings", "FoundryEvals", "FoundryMemoryProvider", + "GeneratedEvaluatorRef", "RawFoundryAgent", "RawFoundryAgentChatClient", "RawFoundryChatClient", "RawFoundryEmbeddingClient", + "RubricDimension", "__version__", "evaluate_foundry_target", "evaluate_traces", diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index eef58b0a040..9cfcc4bc678 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -29,7 +29,8 @@ import asyncio import logging from collections.abc import Sequence -from typing import TYPE_CHECKING, Any +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Literal from agent_framework._evaluation import ( AgentEvalConverter, @@ -51,6 +52,107 @@ logger = logging.getLogger(__name__) + +# region Generated rubric evaluator types + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricDimension: + """A single dimension of a Foundry generated rubric evaluator. + + Rubric evaluators score each item along one or more named dimensions, + each with its own description and weight. Foundry's evaluator + generation pipeline produces these dimensions from agent/workflow + metadata; agent-framework surfaces them so callers can inspect a + generated evaluator's structure without round-tripping through the + portal. + + Attributes: + id: Stable identifier for the dimension (e.g. ``"policy_enforcement"``). + description: Natural-language description of what the dimension scores. + weight: Integer weight controlling the dimension's contribution to + the aggregate score. + always_applicable: When ``False``, evaluators may mark this + dimension non-applicable on a per-item basis. 
+ """ + + id: str + description: str + weight: int + always_applicable: bool = False + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class GeneratedEvaluatorRef: + """A reference to a generated rubric evaluator stored in Foundry. + + Pass instances of this class to :class:`FoundryEvals` to score items + with a previously generated rubric evaluator. Construct directly + when the evaluator already exists, or obtain one from + :meth:`FoundryEvals.generate_rubric`. + + By default ``version`` is required and pinned so an evaluation run is + reproducible. Use :meth:`latest` to opt in to versionless references + explicitly. + + Attributes: + name: Evaluator name as stored in the Foundry project (e.g. + ``"my-policy-evaluator"``). Distinct from built-in + evaluators such as ``"builtin.relevance"``. + version: Pinned evaluator version. ``None`` means "latest" — + this is discouraged for CI/repro and ``FoundryEvals`` will + emit a warning when used. + category: ``"quality"`` for ungrounded rubric scoring, + ``"safety"`` for safety-focused evaluators. Matches the + Foundry evaluator's declared category. + display_name: Optional human-readable name used in result + summaries. Defaults to ``name`` when unset. + description: Optional description carried over from the + generated evaluator definition for documentation. + dimensions: Optional snapshot of the rubric's dimensions for + inspection. Not required to invoke the evaluator — the + service uses the persisted definition. + pass_threshold: Optional aggregate score threshold (0.0-1.0) the + evaluator considers a passing item. ``None`` defers to the + evaluator's stored default. + """ + + name: str + version: str | None = None + category: Literal["quality", "safety"] = "quality" + display_name: str | None = None + description: str | None = None + dimensions: tuple[RubricDimension, ...] 
| None = None + pass_threshold: float | None = None + + @classmethod + def latest( + cls, + name: str, + *, + category: Literal["quality", "safety"] = "quality", + display_name: str | None = None, + description: str | None = None, + ) -> GeneratedEvaluatorRef: + """Construct a versionless reference (resolves to the latest version at run time). + + Discouraged for reproducible runs. Prefer the constructor with + an explicit ``version`` so CI and replay evaluations stay stable + when the evaluator is regenerated. + """ + return cls( + name=name, + version=None, + category=category, + display_name=display_name, + description=description, + ) + + +# endregion + # Agent evaluators that accept query/response as conversation arrays. # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk # for the latest evaluator list. These are the evaluators that need conversation-format input. @@ -166,7 +268,7 @@ def _resolve_evaluator(name: str) -> str: def _build_testing_criteria( - evaluators: Sequence[str], + evaluators: Sequence[str | GeneratedEvaluatorRef], model: str, *, include_data_mapping: bool = False, @@ -175,7 +277,9 @@ def _build_testing_criteria( """Build ``testing_criteria`` for ``evals.create()``. Args: - evaluators: Evaluator names. + evaluators: Evaluator names (built-in shorts / fully-qualified + ``builtin.*`` names) or :class:`GeneratedEvaluatorRef` + instances for generated rubric evaluators. model: Model deployment for the LLM judge. include_data_mapping: Whether to include field-level data mapping (required for the JSONL data source, not needed for response-based). @@ -183,7 +287,36 @@ def _build_testing_criteria( definitions. 
""" criteria: list[dict[str, Any]] = [] - for name in evaluators: + for entry_spec in evaluators: + if isinstance(entry_spec, GeneratedEvaluatorRef): + short = entry_spec.display_name or entry_spec.name + ref_entry: dict[str, Any] = { + "type": "azure_ai_evaluator", + "name": short, + "evaluator_name": entry_spec.name, + "initialization_parameters": {"deployment_name": model}, + } + if entry_spec.version is not None: + ref_entry["evaluator_version"] = entry_spec.version + else: + logger.warning( + "GeneratedEvaluatorRef '%s' has no pinned version; the eval run " + "will resolve to whichever version is current at execution time. " + "Pin the version for reproducible runs.", + entry_spec.name, + ) + if include_data_mapping: + ref_mapping: dict[str, str] = { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + if include_tool_definitions: + ref_mapping["tool_definitions"] = "{{item.tool_definitions}}" + ref_entry["data_mapping"] = ref_mapping + criteria.append(ref_entry) + continue + + name = entry_spec qualified = _resolve_evaluator(name) short = name if not name.startswith("builtin.") else name.split(".")[-1] @@ -247,9 +380,9 @@ def _build_item_schema( def _resolve_default_evaluators( - evaluators: Sequence[str] | None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None, items: Sequence[EvalItem | dict[str, Any]] | None = None, -) -> list[str]: +) -> list[str | GeneratedEvaluatorRef]: """Resolve evaluators, applying defaults when ``None``. Defaults to relevance + coherence + task_adherence. 
Automatically adds @@ -258,7 +391,7 @@ def _resolve_default_evaluators( if evaluators is not None: return list(evaluators) - result = list(_DEFAULT_EVALUATORS) + result: list[str | GeneratedEvaluatorRef] = list(_DEFAULT_EVALUATORS) if items is not None: has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) if has_tools: @@ -267,14 +400,24 @@ def _resolve_default_evaluators( def _filter_tool_evaluators( - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], items: Sequence[EvalItem | dict[str, Any]], -) -> list[str]: - """Remove tool evaluators if no items have tool definitions.""" +) -> list[str | GeneratedEvaluatorRef]: + """Remove tool evaluators if no items have tool definitions. + + Generated rubric evaluators are tool-aware but not tool-required; they + are preserved regardless of whether items carry tool definitions. + """ has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) if has_tools: return evaluators - filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS] + + def _is_tool_only(spec: str | GeneratedEvaluatorRef) -> bool: + if isinstance(spec, GeneratedEvaluatorRef): + return False + return _resolve_evaluator(spec) in _TOOL_EVALUATORS + + filtered = [e for e in evaluators if not _is_tool_only(e)] if not filtered: raise ValueError( f"All requested evaluators {evaluators} require tool definitions, " @@ -282,7 +425,7 @@ def _filter_tool_evaluators( "or choose evaluators that do not require tools." 
) if len(filtered) < len(evaluators): - removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS] + removed = [e for e in evaluators if _is_tool_only(e)] logger.info("Removed tool evaluators %s (no items have tools)", removed) return filtered @@ -472,7 +615,7 @@ async def _evaluate_via_responses_impl( *, client: AsyncOpenAI, response_ids: Sequence[str], - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], model: str, eval_name: str, poll_interval: float, @@ -573,8 +716,11 @@ class FoundryEvals: (from ``azure.ai.projects.aio``). Provide this or *client*. model: Model deployment name for the evaluator LLM judge. Resolved from ``client.model`` when omitted. - evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``). - When ``None`` (default), uses smart defaults based on item data. + evaluators: Evaluator specifications. Entries may be built-in + short names (e.g. ``"relevance"``), fully-qualified + ``"builtin.*"`` names, or :class:`GeneratedEvaluatorRef` + instances for previously generated rubric evaluators. When + ``None`` (default), uses smart defaults based on item data. conversation_split: How to split multi-turn conversations into query/response halves. Defaults to ``LAST_TURN``. Pass a ``ConversationSplit`` enum value or a custom callable — see @@ -623,7 +769,7 @@ def __init__( client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str | None = None, - evaluators: Sequence[str] | None = None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN, poll_interval: float = 5.0, timeout: float = 180.0, @@ -642,7 +788,9 @@ def __init__( "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured." 
) self._model = resolved_model - self._evaluators = list(evaluators) if evaluators is not None else None + self._evaluators: list[str | GeneratedEvaluatorRef] | None = ( + list(evaluators) if evaluators is not None else None + ) self._conversation_split = conversation_split self._poll_interval = poll_interval self._timeout = timeout @@ -678,7 +826,7 @@ async def evaluate( async def _evaluate_via_dataset( self, items: Sequence[EvalItem], - evaluators: list[str], + evaluators: list[str | GeneratedEvaluatorRef], eval_name: str, ) -> EvalResults: """Evaluate using JSONL dataset upload path.""" @@ -761,7 +909,7 @@ async def _evaluate_via_dataset( @experimental(feature_id=ExperimentalFeature.EVALS) async def evaluate_traces( *, - evaluators: Sequence[str] | None = None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str, @@ -854,7 +1002,7 @@ async def evaluate_foundry_target( *, target: dict[str, Any], test_queries: Sequence[str], - evaluators: Sequence[str] | None = None, + evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str, @@ -870,7 +1018,9 @@ async def evaluate_foundry_target( Args: target: Target configuration dict. test_queries: Queries for Foundry to send to the target. - evaluators: Evaluator names. + evaluators: Evaluator names (built-in shorts / fully-qualified + ``builtin.*`` names) or :class:`GeneratedEvaluatorRef` + instances for generated rubric evaluators. client: A ``FoundryChatClient`` instance. Provide this or *project_client*. project_client: An ``AIProjectClient`` instance. model: Model deployment name for the evaluator LLM judge. 
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index a5d9f2e8642..f1ba5c86153 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -807,6 +807,79 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None: assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions" +# --------------------------------------------------------------------------- +# _build_item_schema +# --------------------------------------------------------------------------- + + + def test_generated_evaluator_ref_pinned_version(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True) + + assert len(criteria) == 1 + c = criteria[0] + assert c["type"] == "azure_ai_evaluator" + assert c["evaluator_name"] == "my-rubric" + assert c["evaluator_version"] == "1" + assert c["name"] == "my-rubric" + assert c["initialization_parameters"] == {"deployment_name": "gpt-4o"} + assert c["data_mapping"] == { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + + def test_generated_evaluator_ref_display_name_used_as_short(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric") + criteria = _build_testing_criteria([ref], "gpt-4o") + + assert criteria[0]["name"] == "My Rubric" + assert criteria[0]["evaluator_name"] == "my-rubric" + + def test_generated_evaluator_ref_tool_definitions_added(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria( + [ref], + "gpt-4o", + include_data_mapping=True, + 
include_tool_definitions=True, + ) + + assert criteria[0]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}" + + def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None: + import logging + + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef.latest("my-rubric") + with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"): + criteria = _build_testing_criteria([ref], "gpt-4o") + + assert "evaluator_version" not in criteria[0] + assert any("no pinned version" in r.message for r in caplog.records) + + def test_generated_evaluator_ref_mixed_with_builtins(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="my-rubric", version="1") + criteria = _build_testing_criteria( + ["relevance", ref, "task_adherence"], + "gpt-4o", + include_data_mapping=True, + ) + + assert [c["name"] for c in criteria] == ["relevance", "my-rubric", "task_adherence"] + assert criteria[0]["evaluator_name"] == "builtin.relevance" + assert criteria[1]["evaluator_name"] == "my-rubric" + assert criteria[2]["evaluator_name"] == "builtin.task_adherence" + + # --------------------------------------------------------------------------- # _build_item_schema # --------------------------------------------------------------------------- @@ -1264,6 +1337,37 @@ def test_raises_when_all_filtered(self) -> None: ) +# --------------------------------------------------------------------------- +# EvalResults +# --------------------------------------------------------------------------- + + + def test_preserves_generated_ref_when_no_tools(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="rubric", version="1") + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators( + ["relevance", ref, 
"tool_call_accuracy"], + items, + ) + assert "relevance" in result + assert ref in result + assert "tool_call_accuracy" not in result + + def test_generated_ref_alone_does_not_raise(self) -> None: + from agent_framework_foundry import GeneratedEvaluatorRef + + ref = GeneratedEvaluatorRef(name="rubric", version="1") + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators([ref], items) + assert result == [ref] + + # --------------------------------------------------------------------------- # EvalResults # --------------------------------------------------------------------------- diff --git a/python/uv.lock b/python/uv.lock index 58c0ed50ee5..dee89c9f0a0 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -604,7 +604,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "agent-framework-core", editable = "packages/core" }, - { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = "<=1.0.0b2,>=1.0.0b2" }, + { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = ">=1.0.0b2,<=1.0.0b2" }, ] [[package]] From 4bc60462d91361c0c792c724afd8b3d20cb115c9 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 17:31:48 -0700 Subject: [PATCH 03/16] Python: feat(evals): parse rubric_scores from output items + assertion helpers Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/agent_framework/_evaluation.py | 145 ++++++++++++++++++ .../core/tests/core/test_local_eval.py | 101 ++++++++++++ .../agent_framework_foundry/_foundry_evals.py | 82 +++++++++- .../foundry/tests/test_foundry_evals.py | 100 ++++++++++++ 4 files changed, 426 insertions(+), 2 deletions(-) diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 32ae5bcfba4..9eb8c4393df 100644 --- 
a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -499,6 +499,151 @@ def raise_for_status(self, msg: str | None = None) -> None: detail += f" Errored items: {', '.join(summaries)}." raise EvalNotPassedError(detail) + def assert_score_at_least( + self, + min_score: float, + *, + evaluator: str | None = None, + msg: str | None = None, + ) -> None: + """Assert every item's score (optionally filtered by evaluator) is ``>= min_score``. + + Designed for CI gates on generated rubric evaluators (e.g. + ``results.assert_score_at_least(0.80)``). Includes any + sub-results from workflow evaluations. + + Args: + min_score: Minimum acceptable score (inclusive). + evaluator: When set, only check scores from the evaluator + whose ``EvalScoreResult.name`` matches. + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any matching score is below the threshold. + """ + offenders: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + for score in item.scores: + if evaluator is not None and score.name != evaluator: + continue + if score.score < min_score: + offenders.append(f"{item.item_id}/{score.name}={score.score:.3f}") + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + if offenders: + detail = msg or ( + f"{len(offenders)} score(s) below threshold {min_score}" + f"{' for ' + evaluator if evaluator else ''}: {', '.join(offenders[:5])}" + + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "") + ) + raise EvalNotPassedError(detail) + + def assert_dimension_score_at_least( + self, + dimension_id: str, + min_score: float, + *, + evaluator: str | None = None, + require_applicable: bool = False, + msg: str | None = None, + ) -> None: + """Assert every item's score for a rubric *dimension* is ``>= min_score``. + + Walks ``EvalScoreResult.dimensions`` looking for the named + dimension across all items (and sub-results). 
Non-applicable + dimensions are skipped by default; pass + ``require_applicable=True`` to fail when no applicable score is + produced. + + Args: + dimension_id: Dimension id (matches :attr:`RubricDimension.id`). + min_score: Minimum acceptable dimension score (inclusive). + evaluator: When set, only consider scores from the evaluator + whose ``EvalScoreResult.name`` matches. + require_applicable: When ``True``, missing or non-applicable + dimension scores raise. Defaults to ``False`` (skip). + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When the dimension fails the threshold. + """ + offenders: list[str] = [] + missing_items: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + found_applicable = False + found_any = False + for score in item.scores: + if evaluator is not None and score.name != evaluator: + continue + if not score.dimensions: + continue + for rs in score.dimensions: + if rs.id != dimension_id: + continue + found_any = True + if not rs.applicable: + continue + found_applicable = True + if rs.score is None or rs.score < min_score: + offenders.append( + f"{item.item_id}/{score.name}/{dimension_id}=" + f"{rs.score if rs.score is not None else 'None'}" + ) + if require_applicable and not found_applicable and (not evaluator or found_any): + missing_items.append(item.item_id) + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + problems: list[str] = [] + if offenders: + problems.append( + f"{len(offenders)} dimension score(s) for '{dimension_id}' below {min_score}: " + f"{', '.join(offenders[:5])}" + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "") + ) + if missing_items: + problems.append( + f"Dimension '{dimension_id}' not applicable on {len(missing_items)} item(s): " + f"{', '.join(missing_items[:5])}" + ) + if problems: + raise EvalNotPassedError(msg or "; ".join(problems)) + + def assert_no_failed_items(self, msg: str | None = None) -> None: + 
"""Assert no item ended in ``fail`` or ``error`` status. + + Includes any sub-results from workflow evaluations. + + Args: + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any item failed or errored. + """ + bad: list[str] = [] + + def _check(results: EvalResults) -> None: + for item in results.items: + if item.is_failed or item.is_error: + bad.append(f"{item.item_id}:{item.status}") + for sub in results.sub_results.values(): + _check(sub) + + _check(self) + if bad: + detail = msg or ( + f"{len(bad)} item(s) failed or errored: {', '.join(bad[:5])}" + + (f" (+{len(bad) - 5} more)" if len(bad) > 5 else "") + ) + raise EvalNotPassedError(detail) + + # endregion # region Generated rubric evaluators diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py index 96b0e1a3915..27c413b7151 100644 --- a/python/packages/core/tests/core/test_local_eval.py +++ b/python/packages/core/tests/core/test_local_eval.py @@ -5,6 +5,7 @@ from __future__ import annotations import inspect +from typing import Any import pytest @@ -1026,3 +1027,103 @@ def test_returns_none_for_empty_outputs(self): mock_result.get_outputs.return_value = [] item = _build_overall_item("Hello", mock_result) assert item is None + + +class TestRubricAssertions: + """Tests for EvalResults rubric assertion helpers.""" + + def _build_results(self, item_scores: list[list[tuple[str, float, list[Any] | None]]]) -> Any: + from agent_framework._evaluation import EvalItemResult, EvalResults, EvalScoreResult + + items: list[EvalItemResult] = [] + for i, scores in enumerate(item_scores): + items.append( + EvalItemResult( + item_id=f"oi_{i}", + status="pass", + scores=[EvalScoreResult(name=name, score=score, dimensions=dims) for name, score, dims in scores], + ) + ) + return EvalResults( + provider="Local", + status="completed", + result_counts={"passed": len(items), "failed": 0, "errored": 0}, + items=items, + ) + + def 
test_assert_score_at_least_passes(self): + results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.85, None)]]) + results.assert_score_at_least(0.8) + + def test_assert_score_at_least_raises(self): + from agent_framework._evaluation import EvalNotPassedError + + results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.5, None)]]) + with pytest.raises(EvalNotPassedError, match="below threshold"): + results.assert_score_at_least(0.8) + + def test_assert_score_at_least_filtered_by_evaluator(self): + from agent_framework._evaluation import EvalNotPassedError + + results = self._build_results([[("relevance", 0.9, None), ("coherence", 0.3, None)]]) + # Coherence is low — only fails when not filtered out. + results.assert_score_at_least(0.8, evaluator="relevance") + with pytest.raises(EvalNotPassedError): + results.assert_score_at_least(0.8, evaluator="coherence") + + def test_assert_dimension_score_at_least(self): + from agent_framework._evaluation import EvalNotPassedError, RubricScore + + dims_pass = [ + RubricScore(id="policy", score=4, applicable=True, weight=1, reason="ok"), + RubricScore(id="safety", score=5, applicable=True, weight=1, reason="ok"), + ] + dims_fail = [ + RubricScore(id="policy", score=2, applicable=True, weight=1, reason="bad"), + ] + results = self._build_results([[("rubric", 0.9, dims_pass)], [("rubric", 0.5, dims_fail)]]) + # Safety passes everywhere — no raise. + results.assert_dimension_score_at_least("safety", 4) + # Policy fails on the second item. + with pytest.raises(EvalNotPassedError, match="policy"): + results.assert_dimension_score_at_least("policy", 3) + + def test_assert_dimension_skips_non_applicable_by_default(self): + from agent_framework._evaluation import RubricScore + + dims = [ + RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"), + ] + results = self._build_results([[("rubric", 0.9, dims)]]) + # No applicable scores — should not raise. 
+ results.assert_dimension_score_at_least("optional", 3) + + def test_assert_dimension_require_applicable_raises(self): + from agent_framework._evaluation import EvalNotPassedError, RubricScore + + dims = [ + RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"), + ] + results = self._build_results([[("rubric", 0.9, dims)]]) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("optional", 3, require_applicable=True) + + def test_assert_no_failed_items(self): + from agent_framework._evaluation import EvalItemResult, EvalNotPassedError, EvalResults + + results = EvalResults( + provider="Local", + status="completed", + result_counts={"passed": 1, "failed": 1, "errored": 0}, + items=[ + EvalItemResult(item_id="oi_pass", status="pass"), + EvalItemResult(item_id="oi_fail", status="fail"), + ], + ) + with pytest.raises(EvalNotPassedError, match="failed"): + results.assert_no_failed_items() + + +# --------------------------------------------------------------------------- +# r5 review: _build_overall_item with empty outputs +# --------------------------------------------------------------------------- diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 9cfcc4bc678..b6e3530654c 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -30,7 +30,7 @@ import logging from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, cast from agent_framework._evaluation import ( AgentEvalConverter, @@ -40,6 +40,7 @@ EvalItemResult, EvalResults, EvalScoreResult, + RubricScore, ) from agent_framework._feature_stage import ExperimentalFeature, experimental from openai import AsyncOpenAI @@ -497,6 +498,80 @@ 
def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int] return per_eval + +def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None: + """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload. + + Foundry rubric evaluators include a per-dimension breakdown under + ``properties.rubric_scores`` on each result. The exact location may + vary across SDK versions, so this helper accepts a few shapes: + + * The SDK ``sample`` object exposes ``properties.rubric_scores``. + * The ``sample`` is a dict containing ``properties.rubric_scores``. + * The ``sample`` is a dict with ``rubric_scores`` at the top level. + + Returns ``None`` when no rubric scores are present (i.e. the + evaluator was not a rubric evaluator). + """ + if sample is None: + return None + + raw: Any = None + properties: Any = getattr(sample, "properties", None) + if properties is not None: + raw = getattr(properties, "rubric_scores", None) + if raw is None and isinstance(properties, dict): + raw = cast("dict[str, Any]", properties).get("rubric_scores") + if raw is None and isinstance(sample, dict): + sample_any = cast("dict[str, Any]", sample) + props_dict: Any = sample_any.get("properties") + if isinstance(props_dict, dict): + raw = cast("dict[str, Any]", props_dict).get("rubric_scores") + if raw is None: + raw = sample_any.get("rubric_scores") + + if not raw: + return None + + parsed: list[RubricScore] = [] + raw_iter: Any = raw + for raw_entry in raw_iter: + entry: Any = raw_entry + try: + rid: Any + score_val: Any + applicable: Any + weight: Any + reason: Any + if isinstance(entry, dict): + entry_any = cast("dict[str, Any]", entry) + rid = entry_any.get("id") + score_val = entry_any.get("score") + applicable = entry_any.get("applicable") + weight = entry_any.get("weight") + reason = entry_any.get("reason", "") + else: + rid = getattr(entry, "id", None) + score_val = getattr(entry, "score", None) + applicable = getattr(entry, "applicable", 
None) + weight = getattr(entry, "weight", None) + reason = getattr(entry, "reason", "") or "" + if rid is None or weight is None or applicable is None: + continue + parsed.append( + RubricScore( + id=str(rid), + score=int(score_val) if isinstance(score_val, (int, float)) else None, + applicable=bool(applicable), + weight=int(weight), + reason=str(reason) if reason is not None else "", + ) + ) + except (TypeError, ValueError): + logger.debug("Skipping malformed rubric_scores entry: %s", cast("Any", entry), exc_info=True) + return parsed or None + + async def _fetch_output_items( client: AsyncOpenAI, eval_id: str, @@ -520,12 +595,15 @@ async def _fetch_output_items( # Extract per-evaluator scores scores: list[EvalScoreResult] = [] for r in oi.results or []: + sample = r.sample + dimensions = _extract_rubric_scores(sample) scores.append( EvalScoreResult( name=r.name, score=r.score, passed=r.passed, - sample=r.sample, + sample=sample, + dimensions=dimensions, ) ) diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index f1ba5c86153..7502726d1ad 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -2474,6 +2474,106 @@ async def test_handles_api_failure_gracefully(self) -> None: assert items == [] +# --------------------------------------------------------------------------- +# _poll_eval_run — timeout / failed / canceled paths +# --------------------------------------------------------------------------- + + + async def test_extracts_rubric_scores_from_dict_sample(self) -> None: + from agent_framework_foundry._foundry_evals import _fetch_output_items + + mock_result = MagicMock() + mock_result.name = "my-rubric" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = { + "properties": { + "rubric_scores": [ + {"id": "policy", "score": 4, "applicable": True, "weight": 1, "reason": "ok"}, + {"id": "safety", 
"score": None, "applicable": False, "weight": 1, "reason": "n/a"}, + ] + } + } + + mock_oi = MagicMock() + mock_oi.id = "oi_1" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = None + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + scores = items[0].scores + assert len(scores) == 1 + assert scores[0].dimensions is not None + assert len(scores[0].dimensions) == 2 + policy = next(d for d in scores[0].dimensions if d.id == "policy") + assert policy.score == 4 + assert policy.applicable is True + assert policy.weight == 1 + assert policy.reason == "ok" + safety = next(d for d in scores[0].dimensions if d.id == "safety") + assert safety.score is None + assert safety.applicable is False + +class TestExtractRubricScores: + def test_handles_attribute_style_properties(self) -> None: + from agent_framework_foundry._foundry_evals import _extract_rubric_scores + + rs = MagicMock() + rs.id = "policy" + rs.score = 5 + rs.applicable = True + rs.weight = 2 + rs.reason = "ok" + + sample = MagicMock() + sample.properties = MagicMock() + sample.properties.rubric_scores = [rs] + + result = _extract_rubric_scores(sample) + assert result is not None + assert result[0].id == "policy" + assert result[0].score == 5 + assert result[0].weight == 2 + + def test_top_level_rubric_scores_in_dict(self) -> None: + from agent_framework_foundry._foundry_evals import _extract_rubric_scores + + sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]} + result = _extract_rubric_scores(sample) + assert result is not None + assert result[0].id == "a" + + def test_returns_none_when_missing(self) -> None: + from agent_framework_foundry._foundry_evals import _extract_rubric_scores + + assert _extract_rubric_scores(None) is None + assert 
_extract_rubric_scores({}) is None + assert _extract_rubric_scores({"properties": {}}) is None + + def test_skips_malformed_entries(self) -> None: + from agent_framework_foundry._foundry_evals import _extract_rubric_scores + + sample = { + "properties": { + "rubric_scores": [ + {"id": "good", "score": 3, "applicable": True, "weight": 1, "reason": "ok"}, + {"id": "bad-no-weight", "score": 2, "applicable": True, "reason": "x"}, + ] + } + } + result = _extract_rubric_scores(sample) + assert result is not None + assert len(result) == 1 + assert result[0].id == "good" + + # --------------------------------------------------------------------------- # _poll_eval_run — timeout / failed / canceled paths # --------------------------------------------------------------------------- From 38d51d13b14577a1ff671b4e14cfdeed4f03aefb Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 17:34:04 -0700 Subject: [PATCH 04/16] Python: feat(evals): BaseAgent.as_eval_source / Workflow.as_eval_source Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../packages/core/agent_framework/_agents.py | 43 +++ .../core/agent_framework/_evaluation.py | 151 +++++++++- .../agent_framework/_workflows/_workflow.py | 49 +++ .../core/tests/core/test_local_eval.py | 284 +++++++++++------- 4 files changed, 416 insertions(+), 111 deletions(-) diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py index 585898ae523..65506cadc6f 100644 --- a/python/packages/core/agent_framework/_agents.py +++ b/python/packages/core/agent_framework/_agents.py @@ -444,6 +444,49 @@ def get_session(self, service_session_id: str, *, session_id: str | None = None) """ return AgentSession(session_id=session_id, service_session_id=service_session_id) + def as_eval_source( + self, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + 
include_examples: bool = False, + examples: Sequence[str] | None = None, + ) -> str: + """Render this agent as a textual dossier for rubric-evaluator generation. + + Packages the agent's name, description, instructions, tool + definitions, and optional context-provider class names into a + single plain-text dossier suitable for passing to a rubric + generation pipeline (e.g. ``FoundryEvals.generate_rubric``). + + Defaults are conservative: instructions and tools are included; + examples and context-provider class names are not. + + Keyword Args: + include_instructions: Whether to include the agent's + instructions text. + include_tools: Whether to include tool definitions. + include_context_providers: Whether to include attached + context-provider class names. + include_examples: Whether to include the supplied ``examples``. + examples: Sample queries / interactions to include when + ``include_examples`` is true. + + Returns: + A plain-text dossier describing the agent. + """ + from ._evaluation import _render_agent_dossier # pyright: ignore[reportPrivateUsage] + + return _render_agent_dossier( + self, + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + ) + async def _run_after_providers( self, *, diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 9eb8c4393df..48704d3543c 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -311,8 +311,8 @@ class EvalScoreResult: score: Numeric score from the evaluator. passed: Whether the item passed this evaluator's threshold. sample: Optional raw evaluator output (rationale, metadata). - dimensions: Per-dimension scores for rubric-based evaluators. - ``None`` for non-rubric (e.g. built-in) evaluators. 
+ dimensions: Per-dimension scores when this evaluator is a rubric + evaluator. ``None`` for non-rubric (e.g. built-in) evaluators. """ name: str @@ -560,7 +560,7 @@ def assert_dimension_score_at_least( produced. Args: - dimension_id: Dimension id (matches :attr:`RubricDimension.id`). + dimension_id: Dimension id (matches the rubric definition). min_score: Minimum acceptable dimension score (inclusive). evaluator: When set, only consider scores from the evaluator whose ``EvalScoreResult.name`` matches. @@ -654,14 +654,13 @@ def _check(results: EvalResults) -> None: class RubricScore: """A single dimension's score from a rubric-based evaluator run. - Rubric evaluators (e.g. Foundry's generated rubric evaluators) emit - one ``RubricScore`` per dimension per item. Attached to - :class:`EvalScoreResult` as a typed view of the raw - ``properties.rubric_scores`` payload. + Rubric evaluators emit one ``RubricScore`` per dimension per item. + Attached to :class:`EvalScoreResult` as a typed view of the raw + ``properties.rubric_scores`` payload returned by providers such as + Foundry's generated rubric evaluators. Attributes: - id: Stable identifier for the dimension (e.g. - ``"policy_enforcement"``) defined by the rubric. + id: Dimension id (matches the rubric definition). score: Numeric score, or ``None`` when the dimension was marked non-applicable for this item. applicable: Whether the dimension applied to this item. 
@@ -676,6 +675,140 @@ class RubricScore: reason: str +# endregion + +# region Eval source rendering + + +def _render_agent_dossier( + agent: Any, + *, + include_instructions: bool, + include_tools: bool, + include_context_providers: bool, + include_examples: bool, + examples: Sequence[str] | None, +) -> str: + """Render a structured, plain-text dossier of an agent for rubric generation.""" + lines: list[str] = [] + name = getattr(agent, "name", None) or "" + description = getattr(agent, "description", None) + lines.append(f"Agent name: {name}") + if description: + lines.append(f"Description: {description}") + + if include_instructions: + instructions: str | None = None + default_options: Any = getattr(agent, "default_options", None) + if isinstance(default_options, dict): + raw_instr: Any = cast("dict[str, Any]", default_options).get("instructions") + if isinstance(raw_instr, str) and raw_instr.strip(): + instructions = raw_instr + if instructions is None: + raw_instr = getattr(agent, "instructions", None) + if isinstance(raw_instr, str) and raw_instr.strip(): + instructions = raw_instr + if instructions: + lines.append("") + lines.append("Instructions:") + lines.append(instructions.strip()) + + if include_tools: + tool_defs = AgentEvalConverter.extract_tools(agent) + if tool_defs: + lines.append("") + lines.append("Tools:") + for tool in tool_defs: + tool_line = f"- {tool['name']}" + tool_desc = tool.get("description") + if tool_desc: + tool_line += f": {tool_desc}" + lines.append(tool_line) + params = tool.get("parameters") + if params: + try: + params_json = json.dumps(params, sort_keys=True) + except (TypeError, ValueError): + params_json = str(params) + lines.append(f" parameters: {params_json}") + + if include_context_providers: + providers = getattr(agent, "context_providers", None) + if providers: + lines.append("") + lines.append("Context providers:") + for provider in providers: + lines.append(f"- {type(provider).__name__}") + + if include_examples and 
examples: + lines.append("") + lines.append("Examples:") + for idx, example in enumerate(examples, start=1): + lines.append(f"{idx}. {example}") + + return "\n".join(lines).strip() + + +def _render_workflow_dossier( # pyright: ignore[reportUnusedFunction] + workflow: Workflow, + *, + include_instructions: bool, + include_tools: bool, + include_context_providers: bool, + include_examples: bool, + examples: Sequence[str] | None, + include_topology: bool, +) -> str: + """Render a structured, plain-text dossier of a workflow for rubric generation.""" + from ._workflows._agent_executor import AgentExecutor as _AE + + lines: list[str] = [] + name = workflow.name or "" + lines.append(f"Workflow name: {name}") + if workflow.description: + lines.append(f"Description: {workflow.description}") + + if include_topology: + try: + topology = json.dumps(workflow.to_dict(), sort_keys=True, default=str) + except (TypeError, ValueError) as exc: + logger.debug("Workflow.to_dict() failed during eval source export: %s", exc) + topology = None + if topology: + lines.append("") + lines.append("Topology (JSON):") + lines.append(topology) + + agent_executors: list[tuple[str, Any]] = [] + for executor_id, executor in workflow.executors.items(): + if isinstance(executor, _AE): + agent_executors.append((executor_id, executor.agent)) + + if agent_executors: + lines.append("") + lines.append("Agents:") + for executor_id, agent in agent_executors: + lines.append("") + lines.append(f"Executor: {executor_id}") + dossier = _render_agent_dossier( + agent, + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=False, + examples=None, + ) + lines.append(dossier) + + if include_examples and examples: + lines.append("") + lines.append("Examples:") + for idx, example in enumerate(examples, start=1): + lines.append(f"{idx}. 
{example}") + + return "\n".join(lines).strip() + + # endregion # region Evaluator protocol diff --git a/python/packages/core/agent_framework/_workflows/_workflow.py b/python/packages/core/agent_framework/_workflows/_workflow.py index 0493cd015f3..bce7569ef1a 100644 --- a/python/packages/core/agent_framework/_workflows/_workflow.py +++ b/python/packages/core/agent_framework/_workflows/_workflow.py @@ -410,6 +410,55 @@ def to_json(self) -> str: """Serialize the workflow definition to JSON.""" return json.dumps(self.to_dict()) + def as_eval_source( + self, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + include_examples: bool = False, + examples: Sequence[str] | None = None, + include_topology: bool = True, + ) -> str: + """Render this workflow as a textual dossier for rubric-evaluator generation. + + Produces a plain-text dossier containing the workflow's name, + description, optional JSON-encoded topology (from + :meth:`Workflow.to_dict`), and per-agent dossiers extracted from + ``AgentExecutor`` nodes. Suitable for passing to a rubric + generation pipeline (e.g. ``FoundryEvals.generate_rubric``). + + Defaults are conservative: per-agent instructions and tools are + included, plus the JSON-encoded topology. Examples and + context-provider class names are excluded by default. + + Keyword Args: + include_instructions: Per-agent instructions inclusion. + include_tools: Per-agent tool-definition inclusion. + include_context_providers: Per-agent context-provider + inclusion. + include_examples: Whether to include workflow-level + ``examples``. + examples: Sample queries / interactions to include when + ``include_examples`` is true. + include_topology: Whether to embed the JSON-encoded workflow + topology in the rendered dossier. + + Returns: + A plain-text dossier describing the workflow. 
+ """ + from .._evaluation import _render_workflow_dossier # pyright: ignore[reportPrivateUsage] + + return _render_workflow_dossier( + self, + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + include_topology=include_topology, + ) + def get_start_executor(self) -> Executor: """Get the starting executor of the workflow. diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py index 27c413b7151..c13b107c4bd 100644 --- a/python/packages/core/tests/core/test_local_eval.py +++ b/python/packages/core/tests/core/test_local_eval.py @@ -1011,119 +1011,199 @@ def test_all_passed_parent_fails_when_own_counts_fail(self): # --------------------------------------------------------------------------- -# r5 review: _build_overall_item with empty outputs +# Eval source rendering (string dossiers) # --------------------------------------------------------------------------- -class TestBuildOverallItemEmpty: - """Test _build_overall_item returns None for empty workflow outputs.""" +class TestAgentAsEvalSource: + """Tests for BaseAgent.as_eval_source / _render_agent_dossier.""" - def test_returns_none_for_empty_outputs(self): + def _make_mock_agent( + self, + *, + name: str = "weather-bot", + description: str | None = "Looks up the weather.", + instructions: str | None = "Be concise. 
Always cite the source.", + tools: list[Any] | None = None, + context_providers: list[Any] | None = None, + mcp_tools: list[Any] | None = None, + ) -> Any: from unittest.mock import MagicMock - from agent_framework._evaluation import _build_overall_item - - mock_result = MagicMock() - mock_result.get_outputs.return_value = [] - item = _build_overall_item("Hello", mock_result) - assert item is None - - -class TestRubricAssertions: - """Tests for EvalResults rubric assertion helpers.""" - - def _build_results(self, item_scores: list[list[tuple[str, float, list[Any] | None]]]) -> Any: - from agent_framework._evaluation import EvalItemResult, EvalResults, EvalScoreResult - - items: list[EvalItemResult] = [] - for i, scores in enumerate(item_scores): - items.append( - EvalItemResult( - item_id=f"oi_{i}", - status="pass", - scores=[EvalScoreResult(name=name, score=score, dimensions=dims) for name, score, dims in scores], - ) - ) - return EvalResults( - provider="Local", - status="completed", - result_counts={"passed": len(items), "failed": 0, "errored": 0}, - items=items, + from agent_framework._tools import ai_function + + agent = MagicMock() + agent.name = name + agent.description = description + agent.default_options = {"instructions": instructions, "tools": tools or []} + agent.context_providers = context_providers or [] + agent.mcp_tools = mcp_tools or [] + if tools: + normalized: list[Any] = [] + for t in tools: + if callable(t) and not hasattr(t, "parameters"): + normalized.append(ai_function(t)) + else: + normalized.append(t) + agent.default_options["tools"] = normalized + return agent + + def _render(self, agent: Any, **overrides: Any) -> str: + from agent_framework._evaluation import _render_agent_dossier + + kwargs: dict[str, Any] = { + "include_instructions": True, + "include_tools": True, + "include_context_providers": False, + "include_examples": False, + "examples": None, + } + kwargs.update(overrides) + return _render_agent_dossier(agent, **kwargs) + + 
def test_basic_dossier_includes_name_and_instructions(self): + agent = self._make_mock_agent() + dossier = self._render(agent) + assert isinstance(dossier, str) + assert "Agent name: weather-bot" in dossier + assert "Description: Looks up the weather." in dossier + assert "Instructions:" in dossier + assert "Be concise." in dossier + + def test_tools_section_includes_definitions(self): + def get_weather(city: str) -> str: + """Return the current weather for *city*.""" + return f"sunny in {city}" + + agent = self._make_mock_agent(tools=[get_weather]) + dossier = self._render(agent) + assert "Tools:" in dossier + assert "- get_weather" in dossier + assert '"city"' in dossier + + def test_include_instructions_false_omits_section(self): + agent = self._make_mock_agent() + dossier = self._render(agent, include_instructions=False) + assert "Instructions:" not in dossier + + def test_include_tools_false_omits_section(self): + def get_weather(city: str) -> str: + return f"sunny in {city}" + + agent = self._make_mock_agent(tools=[get_weather]) + dossier = self._render(agent, include_tools=False) + assert "Tools:" not in dossier + + def test_context_providers_excluded_by_default_but_included_when_opted_in(self): + class StubProvider: + pass + + agent = self._make_mock_agent(context_providers=[StubProvider()]) + default_dossier = self._render(agent) + assert "Context providers:" not in default_dossier + + opt_in_dossier = self._render(agent, include_context_providers=True) + assert "Context providers:" in opt_in_dossier + assert "- StubProvider" in opt_in_dossier + + def test_examples_excluded_by_default_but_included_when_opted_in(self): + agent = self._make_mock_agent() + default_dossier = self._render(agent, examples=["What's the weather in NYC?"]) + assert "Examples:" not in default_dossier + + opt_in_dossier = self._render( + agent, + include_examples=True, + examples=["What's the weather in NYC?"], ) + assert "Examples:" in opt_in_dossier + assert "What's the weather in 
NYC?" in opt_in_dossier - def test_assert_score_at_least_passes(self): - results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.85, None)]]) - results.assert_score_at_least(0.8) - - def test_assert_score_at_least_raises(self): - from agent_framework._evaluation import EvalNotPassedError - - results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.5, None)]]) - with pytest.raises(EvalNotPassedError, match="below threshold"): - results.assert_score_at_least(0.8) - - def test_assert_score_at_least_filtered_by_evaluator(self): - from agent_framework._evaluation import EvalNotPassedError - - results = self._build_results([[("relevance", 0.9, None), ("coherence", 0.3, None)]]) - # Coherence is low — only fails when not filtered out. - results.assert_score_at_least(0.8, evaluator="relevance") - with pytest.raises(EvalNotPassedError): - results.assert_score_at_least(0.8, evaluator="coherence") - - def test_assert_dimension_score_at_least(self): - from agent_framework._evaluation import EvalNotPassedError, RubricScore + def test_base_agent_method_returns_dossier_string(self): + from agent_framework._agents import BaseAgent - dims_pass = [ - RubricScore(id="policy", score=4, applicable=True, weight=1, reason="ok"), - RubricScore(id="safety", score=5, applicable=True, weight=1, reason="ok"), - ] - dims_fail = [ - RubricScore(id="policy", score=2, applicable=True, weight=1, reason="bad"), - ] - results = self._build_results([[("rubric", 0.9, dims_pass)], [("rubric", 0.5, dims_fail)]]) - # Safety passes everywhere — no raise. - results.assert_dimension_score_at_least("safety", 4) - # Policy fails on the second item. 
- with pytest.raises(EvalNotPassedError, match="policy"): - results.assert_dimension_score_at_least("policy", 3) - - def test_assert_dimension_skips_non_applicable_by_default(self): - from agent_framework._evaluation import RubricScore - - dims = [ - RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"), - ] - results = self._build_results([[("rubric", 0.9, dims)]]) - # No applicable scores — should not raise. - results.assert_dimension_score_at_least("optional", 3) + class _ConcreteAgent(BaseAgent): + pass - def test_assert_dimension_require_applicable_raises(self): - from agent_framework._evaluation import EvalNotPassedError, RubricScore + agent = _ConcreteAgent(name="test-agent", description="A test agent.") + dossier = agent.as_eval_source() + assert isinstance(dossier, str) + assert "Agent name: test-agent" in dossier - dims = [ - RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"), - ] - results = self._build_results([[("rubric", 0.9, dims)]]) - with pytest.raises(EvalNotPassedError, match="not applicable"): - results.assert_dimension_score_at_least("optional", 3, require_applicable=True) - def test_assert_no_failed_items(self): - from agent_framework._evaluation import EvalItemResult, EvalNotPassedError, EvalResults - - results = EvalResults( - provider="Local", - status="completed", - result_counts={"passed": 1, "failed": 1, "errored": 0}, - items=[ - EvalItemResult(item_id="oi_pass", status="pass"), - EvalItemResult(item_id="oi_fail", status="fail"), - ], - ) - with pytest.raises(EvalNotPassedError, match="failed"): - results.assert_no_failed_items() +class TestWorkflowAsEvalSource: + """Tests for Workflow.as_eval_source / _render_workflow_dossier.""" + def _build_workflow(self, *, with_agent: bool = False) -> Any: + from unittest.mock import MagicMock -# --------------------------------------------------------------------------- -# r5 review: _build_overall_item with empty outputs -# 
--------------------------------------------------------------------------- + from agent_framework._workflows._agent_executor import AgentExecutor + + workflow = MagicMock() + workflow.name = "demo-workflow" + workflow.description = "Routes user questions through a single agent." + workflow.to_dict.return_value = { + "name": "demo-workflow", + "id": "wf_1", + "start_executor_id": "agent_1", + "edge_groups": [], + "executors": {"agent_1": {"type": "AgentExecutor"}}, + } + + if with_agent: + inner_agent = MagicMock() + inner_agent.name = "inner-agent" + inner_agent.description = "Inner agent." + inner_agent.default_options = {"instructions": "Answer politely.", "tools": []} + inner_agent.context_providers = [] + inner_agent.mcp_tools = [] + + executor = MagicMock(spec=AgentExecutor) + executor.agent = inner_agent + workflow.executors = {"agent_1": executor} + else: + workflow.executors = {} + return workflow + + def _render(self, workflow: Any, **overrides: Any) -> str: + from agent_framework._evaluation import _render_workflow_dossier + + kwargs: dict[str, Any] = { + "include_instructions": True, + "include_tools": True, + "include_context_providers": False, + "include_examples": False, + "examples": None, + "include_topology": True, + } + kwargs.update(overrides) + return _render_workflow_dossier(workflow, **kwargs) + + def test_emits_dossier_with_topology(self): + workflow = self._build_workflow() + dossier = self._render(workflow) + assert isinstance(dossier, str) + assert "Workflow name: demo-workflow" in dossier + assert "Topology (JSON):" in dossier + assert '"start_executor_id": "agent_1"' in dossier + + def test_topology_can_be_disabled(self): + workflow = self._build_workflow() + dossier = self._render(workflow, include_topology=False) + assert "Topology (JSON):" not in dossier + + def test_per_agent_dossiers_included_when_executor_is_agent_executor(self): + workflow = self._build_workflow(with_agent=True) + dossier = self._render(workflow) + assert 
"Agents:" in dossier + assert "Executor: agent_1" in dossier + assert "Agent name: inner-agent" in dossier + assert "Answer politely." in dossier + + def test_workflow_examples_excluded_by_default(self): + workflow = self._build_workflow() + default_dossier = self._render(workflow, examples=["Hi"]) + assert "Examples:" not in default_dossier + + opt_in_dossier = self._render(workflow, examples=["Hi"], include_examples=True) + assert "Examples:" in opt_in_dossier From a9e46765ea5e94877c8525e4a62db13396c4ae7c Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Wed, 27 May 2026 08:21:06 -0700 Subject: [PATCH 05/16] Python: feat(foundry-evals): EvalGenerationSource + generate_rubric helper Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/__init__.py | 6 + .../agent_framework_foundry/_foundry_evals.py | 498 +++++++++++++++++- .../foundry/tests/test_foundry_evals.py | 459 +++++++++++++++- 3 files changed, 936 insertions(+), 27 deletions(-) diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index 14eebfaffa0..cafe30eb955 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -11,11 +11,14 @@ RawFoundryEmbeddingClient, ) from ._foundry_evals import ( + EvalGenerationSource, FoundryEvals, GeneratedEvaluatorRef, RubricDimension, + agent_as_eval_source, evaluate_foundry_target, evaluate_traces, + workflow_as_eval_source, ) from ._memory_provider import FoundryMemoryProvider @@ -25,6 +28,7 @@ __version__ = "0.0.0" __all__ = [ + "EvalGenerationSource", "FoundryAgent", "FoundryAgentOptions", "FoundryChatClient", @@ -41,6 +45,8 @@ "RawFoundryEmbeddingClient", "RubricDimension", "__version__", + "agent_as_eval_source", "evaluate_foundry_target", "evaluate_traces", + "workflow_as_eval_source", ] diff --git 
a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index b6e3530654c..0d83d8b1bc3 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -48,6 +48,8 @@ from ._chat_client import FoundryChatClient if TYPE_CHECKING: + from agent_framework._agents import BaseAgent + from agent_framework._workflows._workflow import Workflow from azure.ai.projects.aio import AIProjectClient from openai.types.evals import RunRetrieveResponse @@ -60,12 +62,12 @@ @experimental(feature_id=ExperimentalFeature.EVALS) @dataclass(frozen=True) class RubricDimension: - """A single dimension of a Foundry generated rubric evaluator. + """A single dimension of a generated rubric evaluator. Rubric evaluators score each item along one or more named dimensions, each with its own description and weight. Foundry's evaluator generation pipeline produces these dimensions from agent/workflow - metadata; agent-framework surfaces them so callers can inspect a + metadata; ``RubricDimension`` surfaces them so callers can inspect a generated evaluator's structure without round-tripping through the portal. @@ -152,8 +154,164 @@ def latest( ) -# endregion +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class EvalGenerationSource: + """A source description passed to Foundry's evaluator generation pipeline. + + Rubric evaluator generation consumes one or more sources that describe + the agent or workflow under evaluation. ``FoundryEvals`` translates + instances into the underlying ``*EvaluatorGenerationJobSource`` SDK + types. + + Discriminated by :attr:`type`: + + * ``"prompt"`` - a free-form textual dossier (typical for local agents + and workflows whose tools cannot be fetched server-side). 
+ * ``"agent"`` - a hosted Foundry agent referenced by name so the + service fetches tool definitions and metadata directly. + * ``"dataset"`` - a Foundry dataset of recorded interactions. + * ``"traces"`` - tracing data scoped by metadata. + + Only the fields relevant to :attr:`type` are populated; the remaining + fields stay ``None``. + + Attributes: + type: Source kind. See discriminator above. + description: Optional short description shown in Foundry UI. + prompt: Rendered dossier for ``type="prompt"`` sources. + agent_name: Hosted Foundry agent name for ``type="agent"`` sources. + dataset_name: Foundry dataset name for ``type="dataset"`` sources. + dataset_version: Pinned dataset version (recommended for repro). + metadata: Free-form metadata. Used by ``type="traces"`` sources + for tracing-attribute filters and as a generic escape hatch + for additional fields not yet modeled. + """ + + type: Literal["prompt", "dataset", "agent", "traces"] + description: str | None = None + prompt: str | None = None + agent_name: str | None = None + dataset_name: str | None = None + dataset_version: str | None = None + metadata: dict[str, Any] | None = None + +@experimental(feature_id=ExperimentalFeature.EVALS) +def agent_as_eval_source( + agent: BaseAgent, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + include_examples: bool = False, + examples: Sequence[str] | None = None, + hosted_agent_name: str | None = None, +) -> EvalGenerationSource: + """Render an agent as an :class:`EvalGenerationSource` for rubric generation. + + Wraps :meth:`BaseAgent.as_eval_source` to package the agent's + rendered dossier into a typed Foundry generation source. When + ``hosted_agent_name`` is provided, returns a ``type="agent"`` source + referencing the hosted Foundry agent so the service fetches + server-side metadata directly instead of using a rendered dossier. 
+ + Args: + agent: Agent instance (typically a ``BaseAgent`` subclass). + include_instructions: Whether to include the agent's instructions + text. Defaults to ``True``. + include_tools: Whether to include tool definitions. Defaults to + ``True``. + include_context_providers: Whether to include the names of + attached context-provider classes. Defaults to ``False`` to + avoid leaking implementation details. + include_examples: Whether to include the supplied ``examples``. + Defaults to ``False`` to avoid shipping potentially sensitive + sample inputs by default. + examples: Optional sample queries / interactions to include when + ``include_examples`` is ``True``. + hosted_agent_name: When set, emit a ``type="agent"`` source + referencing the hosted Foundry agent by name instead of a + rendered dossier. + + Returns: + An :class:`EvalGenerationSource` describing the agent. + """ + if hosted_agent_name: + agent_description = getattr(agent, "description", None) + return EvalGenerationSource( + type="agent", + agent_name=hosted_agent_name, + description=agent_description, + ) + + prompt = agent.as_eval_source( + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + ) + agent_description = getattr(agent, "description", None) + return EvalGenerationSource( + type="prompt", + prompt=prompt, + description=agent_description, + ) + + +@experimental(feature_id=ExperimentalFeature.EVALS) +def workflow_as_eval_source( + workflow: Workflow, + *, + include_instructions: bool = True, + include_tools: bool = True, + include_context_providers: bool = False, + include_examples: bool = False, + examples: Sequence[str] | None = None, + include_topology: bool = True, +) -> EvalGenerationSource: + """Render a workflow as an :class:`EvalGenerationSource` for rubric generation. 
+ + Wraps :meth:`Workflow.as_eval_source` to package the workflow's + rendered dossier (workflow name, description, topology, per-agent + dossiers) into a typed ``type="prompt"`` Foundry generation source. + + Args: + workflow: Workflow instance to render. + include_instructions: Per-agent instructions inclusion. + include_tools: Per-agent tools inclusion. + include_context_providers: Per-agent context-provider inclusion. + Defaults to ``False``. + include_examples: Per-agent examples inclusion. Defaults to + ``False``. + examples: Optional workflow-level sample queries. Rendered into + a top-level ``Examples:`` section when ``include_examples`` is + ``True``. + include_topology: Whether to embed the JSON-encoded workflow + topology produced by :meth:`Workflow.to_dict`. Defaults to + ``True``. + + Returns: + A ``type="prompt"`` :class:`EvalGenerationSource` describing the + workflow. + """ + prompt = workflow.as_eval_source( + include_instructions=include_instructions, + include_tools=include_tools, + include_context_providers=include_context_providers, + include_examples=include_examples, + examples=examples, + include_topology=include_topology, + ) + return EvalGenerationSource( + type="prompt", + prompt=prompt, + description=workflow.description, + ) + + +# endregion # Agent evaluators that accept query/response as conversation arrays. # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk # for the latest evaluator list. These are the evaluators that need conversation-format input. @@ -307,6 +465,8 @@ def _build_testing_criteria( entry_spec.name, ) if include_data_mapping: + # Rubric evaluators accept conversation arrays like agent + # evaluators, plus tool_definitions when items are tool-aware. 
ref_mapping: dict[str, str] = { "query": "{{item.query_messages}}", "response": "{{item.response_messages}}", @@ -498,7 +658,6 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int] return per_eval - def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None: """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload. @@ -978,6 +1137,329 @@ async def _evaluate_via_dataset( provider=self.name, ) + @classmethod + @experimental(feature_id=ExperimentalFeature.EVALS) + async def generate_rubric( + cls, + *, + project_client: AIProjectClient, + name: str, + agent: BaseAgent | None = None, + workflow: Workflow | None = None, + sources: Sequence[EvalGenerationSource] | None = None, + category: str = "quality", + model: str | None = None, + display_name: str | None = None, + description: str | None = None, + operation_id: str | None = None, + poll_interval: float = 5.0, + timeout: float = 600.0, + ) -> GeneratedEvaluatorRef: + """Generate a Foundry rubric evaluator from an agent or workflow. + + Drives the Foundry evaluator-generation long-running operation + (``client.beta.evaluators.create_generation_job``) end-to-end and + returns a pinned :class:`GeneratedEvaluatorRef` for use with + :class:`FoundryEvals` ``evaluators=`` lists. + + Exactly one of ``agent``, ``workflow``, or ``sources`` must be + supplied. When ``agent`` or ``workflow`` is given, + :func:`agent_as_eval_source` / :func:`workflow_as_eval_source` is + used to build a single conservative source (instructions and + tools included; examples and context providers excluded). Pass + ``sources=`` directly to control inclusion explicitly or to + provide multiple sources. + + Requires ``azure-ai-projects`` with the rubric-generation APIs + (currently ``2.3.0a*`` on the Azure SDK dev feed; tracked for an + upcoming PyPI release). Raises :class:`NotImplementedError` with + a clear message when the dependency is unavailable. 
+ + Keyword Args: + project_client: Async ``AIProjectClient`` for the target + Foundry project. + name: Evaluator name to register in the project. Must be a + stable identifier (e.g. ``"policy-enforcement-v1"``). + agent: Optional ``BaseAgent`` to derive a source from. + workflow: Optional ``Workflow`` to derive a source from. + sources: Explicit list of :class:`EvalGenerationSource` + instances. Mutually exclusive with ``agent`` / ``workflow``. + category: ``"quality"`` or ``"safety"``. Defaults to + ``"quality"``. + model: Optional model deployment to drive generation. When + omitted the service picks a default. + display_name: Optional human-readable name for the evaluator. + description: Optional description for the evaluator. + operation_id: Optional caller-supplied operation id to make + the create call idempotent. + poll_interval: Seconds between job-status polls. + timeout: Maximum seconds to wait for the job to complete. + + Returns: + A pinned :class:`GeneratedEvaluatorRef` referring to the + newly created evaluator. + + Raises: + ValueError: If the source arguments are inconsistent. + NotImplementedError: If the installed ``azure-ai-projects`` + version does not expose the rubric APIs. + TimeoutError: If the job does not complete within ``timeout``. + RuntimeError: If the generation job ends in a non-succeeded + terminal state. 
+ """ + resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources) + + try: + sdk_types = _import_generation_sdk_types() + except _RubricSdkUnavailableError as exc: + raise NotImplementedError(str(exc)) from exc + + sdk_sources = [_to_sdk_source(s, sdk_types) for s in resolved_sources] + + inputs_kwargs: dict[str, Any] = { + "name": name, + "category": category, + "sources": sdk_sources, + } + if model is not None: + inputs_kwargs["model"] = model + if display_name is not None: + inputs_kwargs["display_name"] = display_name + if description is not None: + inputs_kwargs["description"] = description + + inputs = sdk_types.EvaluatorGenerationInputs(**inputs_kwargs) + job = sdk_types.EvaluatorGenerationJob(inputs=inputs) + + create_kwargs: dict[str, Any] = {"job": job} + if operation_id is not None: + create_kwargs["operation_id"] = operation_id + + evaluators_ops = _get_beta_evaluators(project_client) + created = await evaluators_ops.create_generation_job(**create_kwargs) + completed = await _poll_generation_job( + evaluators_ops, + created, + poll_interval=poll_interval, + timeout=timeout, + ) + + return _generation_job_to_ref(completed, category=category) + + +_TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"}) + + +class _RubricSdkUnavailableError(Exception): + """Raised when azure-ai-projects lacks the rubric-generation APIs.""" + + +@dataclass(frozen=True) +class _GenerationSdkTypes: + """Resolved SDK type handles for rubric-evaluator generation.""" + + EvaluatorGenerationInputs: Any + EvaluatorGenerationJob: Any + PromptSource: Any + AgentSource: Any | None + DatasetSource: Any | None + TracesSource: Any | None + + +_RUBRIC_SDK_MISSING_MSG = ( + "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs " + "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). 
" + "Install a build that exposes " + "`azure.ai.projects.models.EvaluatorGenerationInputs` and " + "`AIProjectClient.beta.evaluators.create_generation_job`." +) + + +def _import_generation_sdk_types() -> _GenerationSdkTypes: + """Lazily resolve the rubric-generation SDK types from azure-ai-projects.""" + try: + from azure.ai.projects import models as _models # type: ignore[import-not-found] + except ImportError as exc: + raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) from exc + + models_mod: Any = _models + inputs_cls: Any = getattr(models_mod, "EvaluatorGenerationInputs", None) + job_cls: Any = getattr(models_mod, "EvaluatorGenerationJob", None) + prompt_cls: Any = getattr(models_mod, "PromptEvaluatorGenerationJobSource", None) + if inputs_cls is None or job_cls is None or prompt_cls is None: + raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) + + agent_cls: Any = getattr(models_mod, "AgentEvaluatorGenerationJobSource", None) + dataset_cls: Any = getattr(models_mod, "DatasetEvaluatorGenerationJobSource", None) + traces_cls: Any = getattr(models_mod, "TracesEvaluatorGenerationJobSource", None) + + return _GenerationSdkTypes( + EvaluatorGenerationInputs=inputs_cls, + EvaluatorGenerationJob=job_cls, + PromptSource=prompt_cls, + AgentSource=agent_cls, + DatasetSource=dataset_cls, + TracesSource=traces_cls, + ) + + +def _get_beta_evaluators(project_client: AIProjectClient) -> Any: + """Return the ``project_client.beta.evaluators`` operations group, or raise.""" + beta = getattr(project_client, "beta", None) + evaluators_ops = getattr(beta, "evaluators", None) if beta is not None else None + if evaluators_ops is None: + raise NotImplementedError(_RUBRIC_SDK_MISSING_MSG) + return evaluators_ops + + +def _coalesce_generation_sources( + *, + agent: BaseAgent | None, + workflow: Workflow | None, + sources: Sequence[EvalGenerationSource] | None, +) -> list[EvalGenerationSource]: + if sources is not None and not sources: + raise ValueError("sources= must 
contain at least one EvalGenerationSource.") + supplied = [bool(agent), bool(workflow), bool(sources)] + if sum(supplied) == 0: + raise ValueError("Provide one of agent=, workflow=, or sources=.") + if sum(supplied) > 1: + raise ValueError("Provide only one of agent=, workflow=, or sources=.") + if sources is not None: + return list(sources) + if agent is not None: + return [agent_as_eval_source(agent)] + if workflow is None: + raise ValueError("workflow= must be provided when agent= and sources= are not set.") + return [workflow_as_eval_source(workflow)] + + +def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) -> Any: + """Translate an :class:`EvalGenerationSource` to its SDK counterpart.""" + if source.type == "prompt": + if not source.prompt: + raise ValueError("EvalGenerationSource(type='prompt') requires a non-empty prompt.") + kwargs: dict[str, Any] = {"prompt": source.prompt} + if source.description is not None: + kwargs["description"] = source.description + return sdk_types.PromptSource(**kwargs) + if source.type == "agent": + if sdk_types.AgentSource is None: + raise NotImplementedError("Installed azure-ai-projects does not expose AgentEvaluatorGenerationJobSource.") + if not source.agent_name: + raise ValueError("EvalGenerationSource(type='agent') requires agent_name.") + kwargs = {"agent_name": source.agent_name} + if source.description is not None: + kwargs["description"] = source.description + return sdk_types.AgentSource(**kwargs) + if source.type == "dataset": + if sdk_types.DatasetSource is None: + raise NotImplementedError( + "Installed azure-ai-projects does not expose DatasetEvaluatorGenerationJobSource." 
+ ) + if not source.dataset_name: + raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.") + kwargs = {"dataset_name": source.dataset_name} + if source.dataset_version is not None: + kwargs["dataset_version"] = source.dataset_version + if source.description is not None: + kwargs["description"] = source.description + return sdk_types.DatasetSource(**kwargs) + if source.type == "traces": + if sdk_types.TracesSource is None: + raise NotImplementedError("Installed azure-ai-projects does not expose TracesEvaluatorGenerationJobSource.") + kwargs = {} + if source.metadata is not None: + kwargs["metadata"] = source.metadata + if source.description is not None: + kwargs["description"] = source.description + return sdk_types.TracesSource(**kwargs) + raise ValueError(f"Unknown EvalGenerationSource type: {source.type!r}") + + +async def _poll_generation_job( + evaluators_ops: Any, + job: Any, + *, + poll_interval: float, + timeout: float, +) -> Any: + """Poll a rubric-generation job until it reaches a terminal state.""" + job_id = getattr(job, "id", None) + if not job_id: + raise RuntimeError("Rubric generation job did not return an id.") + + deadline = asyncio.get_event_loop().time() + timeout + current = job + while True: + status = (getattr(current, "status", "") or "").lower() + if status in _TERMINAL_GENERATION_STATUSES: + if status != "succeeded": + err = getattr(current, "error", None) + err_msg = getattr(err, "message", None) or str(err) if err is not None else status + raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {err_msg}") + return current + if asyncio.get_event_loop().time() >= deadline: + raise TimeoutError( + f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})." 
+ ) + await asyncio.sleep(poll_interval) + current = await evaluators_ops.get_generation_job(job_id) + + +def _generation_job_to_ref(job: Any, *, category: str) -> GeneratedEvaluatorRef: + """Build a pinned :class:`GeneratedEvaluatorRef` from a completed job.""" + artifacts: Any = getattr(job, "artifacts", None) + evaluator: Any = getattr(artifacts, "evaluator", None) if artifacts is not None else None + if evaluator is None: + raise RuntimeError("Rubric generation job completed without an evaluator artifact.") + + ev_name = getattr(evaluator, "name", None) + ev_version = getattr(evaluator, "version", None) + if not ev_name: + raise RuntimeError("Generated evaluator artifact is missing a name.") + if ev_version is None: + raise RuntimeError("Generated evaluator artifact is missing a version.") + + definition: Any = getattr(evaluator, "definition", None) + dimensions_raw: Any = getattr(definition, "dimensions", None) if definition is not None else None + dimensions: tuple[RubricDimension, ...] 
| None = None + if dimensions_raw: + parsed: list[RubricDimension] = [] + for entry in dimensions_raw: + try: + parsed.append( + RubricDimension( + id=str(getattr(entry, "id", "") or ""), + description=str(getattr(entry, "description", "") or ""), + weight=int(getattr(entry, "weight", 0) or 0), + always_applicable=bool(getattr(entry, "always_applicable", False)), + ) + ) + except (TypeError, ValueError): + logger.debug("Skipping malformed dimension on generated evaluator", exc_info=True) + if parsed: + dimensions = tuple(parsed) + + pass_threshold: float | None = None + if definition is not None: + raw_threshold = getattr(definition, "pass_threshold", None) + if isinstance(raw_threshold, (int, float)): + pass_threshold = float(raw_threshold) + + valid_category: str + valid_category = category if category in ("quality", "safety") else "quality" + + return GeneratedEvaluatorRef( + name=str(ev_name), + version=str(ev_version), + category=cast("Any", valid_category), + display_name=getattr(evaluator, "display_name", None), + description=getattr(evaluator, "description", None), + dimensions=dimensions, + pass_threshold=pass_threshold, + ) + # --------------------------------------------------------------------------- # Foundry-specific functions (not part of the Evaluator protocol) @@ -987,7 +1469,7 @@ async def _evaluate_via_dataset( @experimental(feature_id=ExperimentalFeature.EVALS) async def evaluate_traces( *, - evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, + evaluators: Sequence[str] | None = None, client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str, @@ -1080,7 +1562,7 @@ async def evaluate_foundry_target( *, target: dict[str, Any], test_queries: Sequence[str], - evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None, + evaluators: Sequence[str] | None = None, client: FoundryChatClient | None = None, project_client: AIProjectClient | None = None, model: str, @@ -1096,9 +1578,7 @@ async def 
evaluate_foundry_target( Args: target: Target configuration dict. test_queries: Queries for Foundry to send to the target. - evaluators: Evaluator names (built-in shorts / fully-qualified - ``builtin.*`` names) or :class:`GeneratedEvaluatorRef` - instances for generated rubric evaluators. + evaluators: Evaluator names. client: A ``FoundryChatClient`` instance. Provide this or *project_client*. project_client: An ``AIProjectClient`` instance. model: Model deployment name for the evaluator LLM judge. diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index 7502726d1ad..16dc9d50ce7 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -64,6 +64,32 @@ def _make_tool(name: str) -> MagicMock: return t +def _make_stub_agent( + *, + name: str = "alpha", + description: str = "An agent.", + instructions: str = "Be brief.", +) -> MagicMock: + """Mock agent whose as_eval_source returns a real dossier string.""" + from agent_framework._evaluation import _render_agent_dossier + + agent = MagicMock() + agent.name = name + agent.description = description + agent.default_options = {"instructions": instructions, "tools": []} + agent.context_providers = [] + agent.mcp_tools = [] + agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier( + agent, + include_instructions=kw.get("include_instructions", True), + include_tools=kw.get("include_tools", True), + include_context_providers=kw.get("include_context_providers", False), + include_examples=kw.get("include_examples", False), + examples=kw.get("examples"), + ) + return agent + + @dataclass class _MockResultCounts: """Mock matching the OpenAI SDK ResultCounts Pydantic model shape.""" @@ -806,12 +832,6 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None: for c in criteria: assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions" - -# 
--------------------------------------------------------------------------- -# _build_item_schema -# --------------------------------------------------------------------------- - - def test_generated_evaluator_ref_pinned_version(self) -> None: from agent_framework_foundry import GeneratedEvaluatorRef @@ -1336,12 +1356,6 @@ def test_raises_when_all_filtered(self) -> None: items, ) - -# --------------------------------------------------------------------------- -# EvalResults -# --------------------------------------------------------------------------- - - def test_preserves_generated_ref_when_no_tools(self) -> None: from agent_framework_foundry import GeneratedEvaluatorRef @@ -2473,12 +2487,6 @@ async def test_handles_api_failure_gracefully(self) -> None: items = await _fetch_output_items(mock_client, "eval_1", "run_1") assert items == [] - -# --------------------------------------------------------------------------- -# _poll_eval_run — timeout / failed / canceled paths -# --------------------------------------------------------------------------- - - async def test_extracts_rubric_scores_from_dict_sample(self) -> None: from agent_framework_foundry._foundry_evals import _fetch_output_items @@ -2521,6 +2529,30 @@ async def test_extracts_rubric_scores_from_dict_sample(self) -> None: assert safety.score is None assert safety.applicable is False + async def test_no_rubric_scores_when_absent(self) -> None: + from agent_framework_foundry._foundry_evals import _fetch_output_items + + mock_result = MagicMock() + mock_result.name = "relevance" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = None + + mock_oi = MagicMock() + mock_oi.id = "oi_2" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = None + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") 
+ + assert items[0].scores[0].dimensions is None + + class TestExtractRubricScores: def test_handles_attribute_style_properties(self) -> None: from agent_framework_foundry._foundry_evals import _extract_rubric_scores @@ -2962,3 +2994,394 @@ async def test_target_without_type_raises(self) -> None: client=mock_client, model="gpt-4o", ) + + +class TestFoundryAgentAsEvalSource: + """Tests for foundry's agent_as_eval_source helper (wraps BaseAgent.as_eval_source).""" + + def test_returns_prompt_source_with_dossier(self) -> None: + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") + source = agent_as_eval_source(agent) + assert source.type == "prompt" + assert source.description == "Looks up the weather." + assert source.prompt is not None + assert "Agent name: weather-bot" in source.prompt + assert "Be brief." in source.prompt + + def test_hosted_agent_name_emits_agent_source(self) -> None: + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") + source = agent_as_eval_source(agent, hosted_agent_name="weather-bot-hosted-id") + assert source.type == "agent" + assert source.agent_name == "weather-bot-hosted-id" + assert source.prompt is None + assert source.description == "Looks up the weather." 
+ + def test_forwards_keyword_options_to_agent(self) -> None: + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent() + source = agent_as_eval_source(agent, include_instructions=False) + assert source.prompt is not None + assert "Instructions:" not in source.prompt + + +class TestFoundryWorkflowAsEvalSource: + """Tests for foundry's workflow_as_eval_source helper (wraps Workflow.as_eval_source).""" + + def _make_workflow(self) -> MagicMock: + from agent_framework._evaluation import _render_workflow_dossier + + workflow = MagicMock() + workflow.name = "demo-workflow" + workflow.description = "Routes user questions." + workflow.to_dict.return_value = { + "name": "demo-workflow", + "id": "wf_1", + "executors": {}, + "edge_groups": [], + } + workflow.executors = {} + workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier( + workflow, + include_instructions=kw.get("include_instructions", True), + include_tools=kw.get("include_tools", True), + include_context_providers=kw.get("include_context_providers", False), + include_examples=kw.get("include_examples", False), + examples=kw.get("examples"), + include_topology=kw.get("include_topology", True), + ) + return workflow + + def test_returns_prompt_source_with_topology(self) -> None: + from agent_framework_foundry._foundry_evals import workflow_as_eval_source + + workflow = self._make_workflow() + source = workflow_as_eval_source(workflow) + assert source.type == "prompt" + assert source.description == "Routes user questions." 
+ assert source.prompt is not None + assert "Workflow name: demo-workflow" in source.prompt + assert "Topology (JSON):" in source.prompt + + def test_topology_can_be_disabled(self) -> None: + from agent_framework_foundry._foundry_evals import workflow_as_eval_source + + workflow = self._make_workflow() + source = workflow_as_eval_source(workflow, include_topology=False) + assert source.prompt is not None + assert "Topology (JSON):" not in source.prompt + + +class TestCoalesceGenerationSources: + """Validation for the source-resolution helper used by FoundryEvals.generate_rubric.""" + + def test_requires_exactly_one_source(self) -> None: + from agent_framework_foundry._foundry_evals import _coalesce_generation_sources + + with pytest.raises(ValueError, match="Provide one of"): + _coalesce_generation_sources(agent=None, workflow=None, sources=None) + + def test_rejects_multiple_sources(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _coalesce_generation_sources + + agent = MagicMock() + agent.name = "a" + agent.description = None + agent.default_options = {"instructions": "x", "tools": []} + agent.context_providers = [] + agent.mcp_tools = [] + with pytest.raises(ValueError, match="only one of"): + _coalesce_generation_sources( + agent=agent, + workflow=None, + sources=[EvalGenerationSource(type="prompt", prompt="hi")], + ) + + def test_uses_agent_helper_when_only_agent_supplied(self) -> None: + from agent_framework_foundry._foundry_evals import _coalesce_generation_sources + + agent = _make_stub_agent(name="alpha", description="An agent.") + + sources = _coalesce_generation_sources(agent=agent, workflow=None, sources=None) + assert len(sources) == 1 + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Agent name: alpha" in sources[0].prompt + + def test_rejects_empty_sources_list(self) -> None: + from agent_framework_foundry._foundry_evals import _coalesce_generation_sources + + with 
pytest.raises(ValueError, match="at least one"): + _coalesce_generation_sources(agent=None, workflow=None, sources=[]) + + +class TestToSdkSource: + """Translation between EvalGenerationSource and SDK *JobSource types.""" + + def _make_sdk_types(self, *, with_agent: bool = True, with_dataset: bool = True, with_traces: bool = True) -> Any: + from agent_framework_foundry._foundry_evals import _GenerationSdkTypes + + return _GenerationSdkTypes( + EvaluatorGenerationInputs=MagicMock(), + EvaluatorGenerationJob=MagicMock(), + PromptSource=MagicMock(name="PromptSource"), + AgentSource=MagicMock(name="AgentSource") if with_agent else None, + DatasetSource=MagicMock(name="DatasetSource") if with_dataset else None, + TracesSource=MagicMock(name="TracesSource") if with_traces else None, + ) + + def test_prompt_source_is_translated(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types() + sdk.PromptSource.return_value = "prompt-sdk-instance" + out = _to_sdk_source( + EvalGenerationSource(type="prompt", prompt="hello", description="d"), + sdk, + ) + assert out == "prompt-sdk-instance" + sdk.PromptSource.assert_called_once_with(prompt="hello", description="d") + + def test_prompt_without_text_raises(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types() + with pytest.raises(ValueError, match="non-empty prompt"): + _to_sdk_source(EvalGenerationSource(type="prompt"), sdk) + + def test_agent_source_is_translated(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types() + sdk.AgentSource.return_value = "agent-sdk-instance" + out = _to_sdk_source( + EvalGenerationSource(type="agent", agent_name="my-hosted-agent"), + sdk, + ) + assert out == "agent-sdk-instance" + sdk.AgentSource.assert_called_once_with(agent_name="my-hosted-agent") + + def 
test_agent_source_requires_name(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types() + with pytest.raises(ValueError, match="agent_name"): + _to_sdk_source(EvalGenerationSource(type="agent"), sdk) + + def test_agent_source_raises_when_sdk_missing(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types(with_agent=False) + with pytest.raises(NotImplementedError, match="AgentEvaluatorGenerationJobSource"): + _to_sdk_source( + EvalGenerationSource(type="agent", agent_name="x"), + sdk, + ) + + def test_dataset_source_is_translated(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types() + sdk.DatasetSource.return_value = "dataset-sdk-instance" + out = _to_sdk_source( + EvalGenerationSource(type="dataset", dataset_name="ds", dataset_version="1"), + sdk, + ) + assert out == "dataset-sdk-instance" + sdk.DatasetSource.assert_called_once_with(dataset_name="ds", dataset_version="1") + + +class TestPollGenerationJob: + """Behavior of the rubric-generation polling loop.""" + + async def test_returns_immediately_on_succeeded(self) -> None: + from agent_framework_foundry._foundry_evals import _poll_generation_job + + evaluators_ops = MagicMock() + evaluators_ops.get_generation_job = AsyncMock() + job = MagicMock(id="job_1", status="succeeded") + out = await _poll_generation_job(evaluators_ops, job, poll_interval=0.01, timeout=1.0) + assert out is job + evaluators_ops.get_generation_job.assert_not_called() + + async def test_polls_until_terminal(self) -> None: + from agent_framework_foundry._foundry_evals import _poll_generation_job + + running = MagicMock(id="job_1", status="running") + succeeded = MagicMock(id="job_1", status="succeeded") + evaluators_ops = MagicMock() + evaluators_ops.get_generation_job = AsyncMock(side_effect=[running, 
succeeded]) + + initial = MagicMock(id="job_1", status="running") + out = await _poll_generation_job(evaluators_ops, initial, poll_interval=0.001, timeout=1.0) + assert out is succeeded + assert evaluators_ops.get_generation_job.await_count == 2 + + async def test_failed_status_raises(self) -> None: + from agent_framework_foundry._foundry_evals import _poll_generation_job + + err = MagicMock(message="boom") + terminal = MagicMock(id="job_1", status="failed", error=err) + evaluators_ops = MagicMock() + evaluators_ops.get_generation_job = AsyncMock(return_value=terminal) + + with pytest.raises(RuntimeError, match="boom"): + await _poll_generation_job( + evaluators_ops, + MagicMock(id="job_1", status="running"), + poll_interval=0.001, + timeout=1.0, + ) + + async def test_timeout_raises(self) -> None: + from agent_framework_foundry._foundry_evals import _poll_generation_job + + running = MagicMock(id="job_1", status="running") + evaluators_ops = MagicMock() + evaluators_ops.get_generation_job = AsyncMock(return_value=running) + + with pytest.raises(TimeoutError): + await _poll_generation_job(evaluators_ops, running, poll_interval=0.001, timeout=0.005) + + +class TestGenerationJobToRef: + """Translation of a completed generation job to a GeneratedEvaluatorRef.""" + + def test_builds_pinned_ref_with_dimensions(self) -> None: + from agent_framework_foundry._foundry_evals import RubricDimension, _generation_job_to_ref + + dim = MagicMock(id="d1", description="dim", weight=2, always_applicable=True) + definition = MagicMock(dimensions=[dim], pass_threshold=0.75) + evaluator = MagicMock( + name="my-eval", + version=3, + display_name="My Eval", + description="A custom rubric.", + definition=definition, + ) + evaluator.name = "my-eval" + job = MagicMock(artifacts=MagicMock(evaluator=evaluator)) + + ref = _generation_job_to_ref(job, category="quality") + assert ref.name == "my-eval" + assert ref.version == "3" + assert ref.display_name == "My Eval" + assert ref.description == 
"A custom rubric." + assert ref.category == "quality" + assert ref.pass_threshold == 0.75 + assert ref.dimensions is not None + assert ref.dimensions[0] == RubricDimension(id="d1", description="dim", weight=2, always_applicable=True) + + def test_missing_artifacts_raises(self) -> None: + from agent_framework_foundry._foundry_evals import _generation_job_to_ref + + job = MagicMock(artifacts=None) + with pytest.raises(RuntimeError, match="evaluator artifact"): + _generation_job_to_ref(job, category="quality") + + +class TestGenerateRubricSdkMissing: + """generate_rubric raises NotImplementedError when SDK lacks the rubric APIs.""" + + async def test_raises_when_sdk_types_unavailable(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + from agent_framework_foundry._foundry_evals import EvalGenerationSource + + def _raise() -> Any: + raise fm._RubricSdkUnavailableError(fm._RUBRIC_SDK_MISSING_MSG) + + monkeypatch.setattr(fm, "_import_generation_sdk_types", _raise) + + project_client = MagicMock() + + with pytest.raises(NotImplementedError, match="rubric"): + await FoundryEvals.generate_rubric( + project_client=project_client, + name="my-eval", + sources=[EvalGenerationSource(type="prompt", prompt="hi")], + ) + + +class TestGenerateRubricE2E: + """End-to-end happy path for generate_rubric with mocked SDK.""" + + async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + + # Stub SDK type handles + prompt_cls = MagicMock(name="PromptSource") + prompt_cls.return_value = "sdk-prompt" + inputs_cls = MagicMock(name="EvaluatorGenerationInputs") + inputs_cls.return_value = "sdk-inputs" + job_cls = MagicMock(name="EvaluatorGenerationJob") + job_cls.return_value = "sdk-job" + + sdk_types = fm._GenerationSdkTypes( + EvaluatorGenerationInputs=inputs_cls, + EvaluatorGenerationJob=job_cls, + PromptSource=prompt_cls, + AgentSource=None, + 
DatasetSource=None, + TracesSource=None, + ) + monkeypatch.setattr(fm, "_import_generation_sdk_types", lambda: sdk_types) + + # Mock the SDK operations and completed job + completed_evaluator = MagicMock(version="7", display_name=None, description=None) + completed_evaluator.name = "agent-rubric" + completed_evaluator.definition = MagicMock(dimensions=[], pass_threshold=None) + completed = MagicMock( + id="job_42", + status="succeeded", + artifacts=MagicMock(evaluator=completed_evaluator), + ) + + evaluators_ops = MagicMock() + evaluators_ops.create_generation_job = AsyncMock(return_value=completed) + evaluators_ops.get_generation_job = AsyncMock(return_value=completed) + project_client = MagicMock() + project_client.beta = MagicMock(evaluators=evaluators_ops) + + # Build a stub agent + agent = _make_stub_agent( + name="weather-bot", + description="Looks up weather.", + instructions="Be brief.", + ) + + ref = await FoundryEvals.generate_rubric( + project_client=project_client, + name="agent-rubric", + agent=agent, + category="quality", + model="gpt-4o", + display_name="Display", + description="Desc", + operation_id="op-123", + ) + + assert ref.name == "agent-rubric" + assert ref.version == "7" + assert ref.category == "quality" + + # Verify inputs/job/source assembly + prompt_cls.assert_called_once() + prompt_kwargs = prompt_cls.call_args.kwargs + assert "Agent name: weather-bot" in prompt_kwargs["prompt"] + assert prompt_kwargs["description"] == "Looks up weather." 
+ + inputs_cls.assert_called_once() + inputs_kwargs = inputs_cls.call_args.kwargs + assert inputs_kwargs["name"] == "agent-rubric" + assert inputs_kwargs["category"] == "quality" + assert inputs_kwargs["model"] == "gpt-4o" + assert inputs_kwargs["display_name"] == "Display" + assert inputs_kwargs["description"] == "Desc" + assert inputs_kwargs["sources"] == ["sdk-prompt"] + + job_cls.assert_called_once_with(inputs="sdk-inputs") + evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123") From 4c7f94f665562860d660c707d2ad5418db96c08d Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Wed, 27 May 2026 08:22:00 -0700 Subject: [PATCH 06/16] Python: feat(foundry-evals): YAML config loader + sample Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/__init__.py | 12 + .../agent_framework_foundry/_evals_config.py | 403 ++++++++++++++++++ .../foundry/tests/test_evals_config.py | 273 ++++++++++++ .../evaluate_with_generated_rubric_sample.py | 151 +++++++ .../evaluation/foundry_evals/evaluators.yaml | 11 + 5 files changed, 850 insertions(+) create mode 100644 python/packages/foundry/agent_framework_foundry/_evals_config.py create mode 100644 python/packages/foundry/tests/test_evals_config.py create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index cafe30eb955..efbe0b8d248 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -10,6 +10,13 @@ FoundryEmbeddingSettings, RawFoundryEmbeddingClient, ) +from ._evals_config import ( + RubricGenerationSpec, + RubricSourceSpec, + 
build_sources, + load_evals_config, + parse_evals_config, +) from ._foundry_evals import ( EvalGenerationSource, FoundryEvals, @@ -44,9 +51,14 @@ "RawFoundryChatClient", "RawFoundryEmbeddingClient", "RubricDimension", + "RubricGenerationSpec", + "RubricSourceSpec", "__version__", "agent_as_eval_source", + "build_sources", "evaluate_foundry_target", "evaluate_traces", + "load_evals_config", + "parse_evals_config", "workflow_as_eval_source", ] diff --git a/python/packages/foundry/agent_framework_foundry/_evals_config.py b/python/packages/foundry/agent_framework_foundry/_evals_config.py new file mode 100644 index 00000000000..5f45e2854b8 --- /dev/null +++ b/python/packages/foundry/agent_framework_foundry/_evals_config.py @@ -0,0 +1,403 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""YAML-driven evaluator configuration for rubric generation and evaluation. + +Defines the source-controlled config schema described in +``adaptive-evals-draft.md``: a list of named rubric-generation specs that +CI jobs and harnesses parse to drive +:meth:`FoundryEvals.generate_rubric`. + +Example config: + +.. code-block:: yaml + + evaluators: + reservation-agent-quality: + type: foundry.generated_rubric + category: quality + model: gpt-4o + agent: reservation-agent + sources: + - type: agent + include_instructions: true + include_tools: true + - type: dataset + name: reservation-business-rules + version: "1" + +Example loader usage: + +.. 
code-block:: python + + from agent_framework_foundry import FoundryEvals, build_sources, load_evals_config + + config = load_evals_config("evaluators.yaml") + spec = config["reservation-agent-quality"] + sources = build_sources(spec, agent=agent) + ref = await FoundryEvals.generate_rubric( + project_client=client, + name=spec.name, + sources=sources, + category=spec.category, + model=spec.model, + display_name=spec.display_name, + description=spec.description, + ) +""" + +from __future__ import annotations + +import os +from collections.abc import Mapping +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Literal, cast + +from agent_framework._feature_stage import ExperimentalFeature, experimental + +from ._foundry_evals import ( + EvalGenerationSource, + agent_as_eval_source, + workflow_as_eval_source, +) + +_RUBRIC_TYPE = "foundry.generated_rubric" + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricSourceSpec: + """A single source entry in a :class:`RubricGenerationSpec` ``sources`` list. + + Mirrors the per-source YAML schema. The :attr:`type` field is the + discriminator; only the fields relevant to each type are read. + + Attributes: + type: One of ``"agent"``, ``"workflow"``, ``"prompt"``, + ``"dataset"``, ``"traces"``. + description: Optional description shown in Foundry UI. + include_instructions: Whether to include the bound agent / + workflow's instructions. Applies to ``"agent"`` and + ``"workflow"`` types. + include_tools: Whether to include the bound agent / workflow's + tools. Applies to ``"agent"`` and ``"workflow"`` types. + include_context_providers: Whether to include attached + context-provider class names. Applies to ``"agent"`` and + ``"workflow"`` types. + include_examples: Whether to include ``examples``. Applies to + ``"agent"`` and ``"workflow"`` types. + include_topology: Whether to include the JSON-encoded topology. + Applies to ``"workflow"`` type. 
+ examples: Optional list of example queries for ``"agent"`` / + ``"workflow"`` sources. + prompt: Rendered dossier for ``"prompt"`` type. + agent_name: Hosted Foundry agent name for ``"agent"`` type with + a server-side reference. + name: Dataset name for ``"dataset"`` type. + version: Pinned dataset version. + metadata: Free-form metadata for ``"traces"`` sources. + """ + + type: Literal["agent", "workflow", "prompt", "dataset", "traces"] + description: str | None = None + include_instructions: bool = True + include_tools: bool = True + include_context_providers: bool = False + include_examples: bool = False + include_topology: bool = True + examples: tuple[str, ...] = field(default_factory=tuple) + prompt: str | None = None + agent_name: str | None = None + name: str | None = None + version: str | None = None + metadata: dict[str, Any] | None = None + + +@experimental(feature_id=ExperimentalFeature.EVALS) +@dataclass(frozen=True) +class RubricGenerationSpec: + """A single named entry from an evaluators YAML config. + + Attributes: + name: Evaluator name (the YAML key under ``evaluators``). + type: Discriminator literal. Must be + ``"foundry.generated_rubric"`` for rubric evaluators. + category: ``"quality"`` or ``"safety"``. + model: Optional model deployment to drive generation. + agent: Optional symbolic reference to the agent in the + caller's harness. Resolved by user code into a + :class:`BaseAgent` and passed to + :func:`build_sources`. + workflow: Optional symbolic reference to a workflow. + display_name: Optional human-readable name. + description: Optional description. + sources: List of source specs to feed into generation. When + empty, callers typically default to a single + ``RubricSourceSpec(type='agent')`` or + ``RubricSourceSpec(type='workflow')`` source. 
+ """ + + name: str + type: str = _RUBRIC_TYPE + category: Literal["quality", "safety"] = "quality" + model: str | None = None + agent: str | None = None + workflow: str | None = None + display_name: str | None = None + description: str | None = None + sources: tuple[RubricSourceSpec, ...] = field(default_factory=tuple) + + +@experimental(feature_id=ExperimentalFeature.EVALS) +def load_evals_config(path: str | os.PathLike[str]) -> dict[str, RubricGenerationSpec]: + """Load a YAML evaluators config and return a name -> spec mapping. + + Reads ``path`` (UTF-8) and parses the top-level ``evaluators`` + mapping into :class:`RubricGenerationSpec` instances keyed by name. + + Requires ``PyYAML``. Raises :class:`ImportError` with a helpful + message when PyYAML is not installed. + + Args: + path: Filesystem path to the YAML config. + + Returns: + A dict mapping evaluator name to :class:`RubricGenerationSpec`. + + Raises: + ImportError: If PyYAML is not installed. + ValueError: If the parsed document fails :func:`parse_evals_config` validation. + """ + try: + import yaml # type: ignore[import-untyped] + except ImportError as exc: + raise ImportError("load_evals_config requires PyYAML. Install with `pip install pyyaml`.") from exc + + raw = yaml.safe_load(Path(path).read_text(encoding="utf-8")) + return parse_evals_config(raw) + + +@experimental(feature_id=ExperimentalFeature.EVALS) +def parse_evals_config(data: Any) -> dict[str, RubricGenerationSpec]: + """Parse an already-loaded YAML mapping into rubric-generation specs. + + Useful when callers manage YAML loading themselves (e.g. CI that + interpolates env vars before parsing). + + Args: + data: A mapping with an ``"evaluators"`` key containing a mapping + of evaluator names to spec dicts. + + Returns: + A dict mapping evaluator name to :class:`RubricGenerationSpec`. + + Raises: + ValueError: If the structure is malformed. 
+ """ + if not isinstance(data, Mapping): + raise ValueError("Evaluators config must be a mapping.") + data_map = cast("Mapping[str, Any]", data) + raw_evaluators = data_map.get("evaluators") + if raw_evaluators is None: + raise ValueError("Evaluators config is missing a top-level 'evaluators' key.") + if not isinstance(raw_evaluators, Mapping): + raise ValueError("Evaluators config 'evaluators' entry must be a mapping.") + evaluators = cast("Mapping[str, Any]", raw_evaluators) + + parsed: dict[str, RubricGenerationSpec] = {} + for name, raw in evaluators.items(): + if not isinstance(raw, Mapping): + raise ValueError(f"Evaluator entry {name!r} must be a mapping, got {type(raw).__name__}.") + raw_map = cast("Mapping[str, Any]", raw) + parsed[name] = _parse_spec(name, raw_map) + return parsed + + +def _parse_spec(name: str, raw: Mapping[str, Any]) -> RubricGenerationSpec: + type_value = raw.get("type", _RUBRIC_TYPE) + if type_value != _RUBRIC_TYPE: + raise ValueError(f"Evaluator {name!r} has unsupported type {type_value!r}; expected {_RUBRIC_TYPE!r}.") + category = raw.get("category", "quality") + if category not in ("quality", "safety"): + raise ValueError(f"Evaluator {name!r} has invalid category {category!r}; expected 'quality' or 'safety'.") + + raw_sources_obj: Any = raw.get("sources") or () + if not isinstance(raw_sources_obj, (list, tuple)): + raise ValueError(f"Evaluator {name!r} 'sources' must be a list.") + sources_iter: list[Any] = list(cast("Any", raw_sources_obj)) + sources: list[RubricSourceSpec] = [] + for index, raw_source in enumerate(sources_iter): + if not isinstance(raw_source, Mapping): + raise ValueError( + f"Evaluator {name!r} source entry {index} must be a mapping, got {type(raw_source).__name__}." 
+ ) + sources.append(_parse_source(name, index, cast("Mapping[str, Any]", raw_source))) + + return RubricGenerationSpec( + name=name, + type=type_value, + category=category, + model=raw.get("model"), + agent=raw.get("agent"), + workflow=raw.get("workflow"), + display_name=raw.get("display_name"), + description=raw.get("description"), + sources=tuple(sources), + ) + + +def _parse_source(spec_name: str, index: int, raw: Mapping[str, Any]) -> RubricSourceSpec: + type_value = raw.get("type") + if type_value not in ("agent", "workflow", "prompt", "dataset", "traces"): + raise ValueError( + f"Evaluator {spec_name!r} source {index} has invalid type {type_value!r}; " + "expected one of 'agent', 'workflow', 'prompt', 'dataset', 'traces'." + ) + + examples_raw: Any = raw.get("examples") or () + if not isinstance(examples_raw, (list, tuple)): + raise ValueError(f"Evaluator {spec_name!r} source {index} 'examples' must be a list.") + examples_iter: list[Any] = list(cast("Any", examples_raw)) + examples = tuple(str(e) for e in examples_iter) + + metadata_raw = raw.get("metadata") + if metadata_raw is not None and not isinstance(metadata_raw, Mapping): + raise ValueError(f"Evaluator {spec_name!r} source {index} 'metadata' must be a mapping.") + + return RubricSourceSpec( + type=cast("Any", type_value), + description=raw.get("description"), + include_instructions=bool(raw.get("include_instructions", True)), + include_tools=bool(raw.get("include_tools", True)), + include_context_providers=bool(raw.get("include_context_providers", False)), + include_examples=bool(raw.get("include_examples", False)), + include_topology=bool(raw.get("include_topology", True)), + examples=examples, + prompt=raw.get("prompt"), + agent_name=raw.get("agent_name"), + name=raw.get("name"), + version=str(raw.get("version")) if raw.get("version") is not None else None, + metadata=dict(cast("Mapping[str, Any]", metadata_raw)) if metadata_raw is not None else None, + ) + + 
+@experimental(feature_id=ExperimentalFeature.EVALS) +def build_sources( + spec: RubricGenerationSpec, + *, + agent: Any | None = None, + workflow: Any | None = None, +) -> list[EvalGenerationSource]: + """Translate a spec's source list into :class:`EvalGenerationSource` instances. + + Resolves each :class:`RubricSourceSpec` against the supplied + ``agent`` and ``workflow`` instances: + + * ``type='agent'`` sources call :func:`agent_as_eval_source` with + the spec's include-flags. If the source carries an + ``agent_name`` the agent is referenced server-side instead. + * ``type='workflow'`` sources call + :func:`workflow_as_eval_source` with the spec's include-flags. + * ``type='prompt'``, ``type='dataset'``, and ``type='traces'`` + sources are translated directly into + :class:`EvalGenerationSource` instances without consulting the + runtime agent or workflow. + + When the spec has no ``sources`` entries, defaults to a single + ``type='agent'`` source when an ``agent`` is provided, or a single + ``type='workflow'`` source when a ``workflow`` is provided. + + Args: + spec: Parsed :class:`RubricGenerationSpec`. + agent: Optional agent instance for ``type='agent'`` sources. + workflow: Optional workflow instance for ``type='workflow'`` + sources. + + Returns: + A list of :class:`EvalGenerationSource` instances ready to pass + to :meth:`FoundryEvals.generate_rubric` as ``sources=``. + + Raises: + ValueError: If a required agent, workflow, ``prompt`` or dataset + ``name`` is missing, or the spec has nothing to build from. 
+ """ + if not spec.sources: + if agent is not None: + return [agent_as_eval_source(agent)] + if workflow is not None: + return [workflow_as_eval_source(workflow)] + raise ValueError(f"Spec {spec.name!r} has no sources and no agent/workflow was provided to build_sources().") + + out: list[EvalGenerationSource] = [] + for src in spec.sources: + if src.type == "agent": + if src.agent_name: + out.append( + EvalGenerationSource( + type="agent", + agent_name=src.agent_name, + description=src.description, + ) + ) + continue + if agent is None: + raise ValueError(f"Spec {spec.name!r} has a source of type 'agent' but no agent= was provided.") + out.append( + agent_as_eval_source( + agent, + include_instructions=src.include_instructions, + include_tools=src.include_tools, + include_context_providers=src.include_context_providers, + include_examples=src.include_examples, + examples=list(src.examples) if src.examples else None, + ) + ) + elif src.type == "workflow": + if workflow is None: + raise ValueError(f"Spec {spec.name!r} has a source of type 'workflow' but no workflow= was provided.") + out.append( + workflow_as_eval_source( + workflow, + include_instructions=src.include_instructions, + include_tools=src.include_tools, + include_context_providers=src.include_context_providers, + include_examples=src.include_examples, + examples=list(src.examples) if src.examples else None, + include_topology=src.include_topology, + ) + ) + elif src.type == "prompt": + if not src.prompt: + raise ValueError(f"Spec {spec.name!r} has a 'prompt' source missing the 'prompt' field.") + out.append(EvalGenerationSource(type="prompt", prompt=src.prompt, description=src.description)) + elif src.type == "dataset": + if not src.name: + raise ValueError(f"Spec {spec.name!r} has a 'dataset' source missing the 'name' field.") + out.append( + EvalGenerationSource( + type="dataset", + dataset_name=src.name, + dataset_version=src.version, + description=src.description, + ) + ) + elif src.type == 
"traces": + out.append( + EvalGenerationSource( + type="traces", + description=src.description, + metadata=src.metadata, + ) + ) + else: # pragma: no cover - guarded by _parse_source + raise ValueError(f"Spec {spec.name!r} has unknown source type {src.type!r}.") + return out + + +__all__ = [ + "RubricGenerationSpec", + "RubricSourceSpec", + "build_sources", + "load_evals_config", + "parse_evals_config", +] diff --git a/python/packages/foundry/tests/test_evals_config.py b/python/packages/foundry/tests/test_evals_config.py new file mode 100644 index 00000000000..a1c86187d47 --- /dev/null +++ b/python/packages/foundry/tests/test_evals_config.py @@ -0,0 +1,273 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Tests for the YAML-driven evaluator configuration loader.""" + +from __future__ import annotations + +import textwrap +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock + +import pytest + +from agent_framework_foundry._evals_config import ( + RubricGenerationSpec, + RubricSourceSpec, + build_sources, + load_evals_config, + parse_evals_config, +) +from agent_framework_foundry._foundry_evals import EvalGenerationSource + + +def _make_agent(name: str = "agent-a", instructions: str = "Be brief.") -> Any: + from agent_framework._evaluation import _render_agent_dossier + + agent = MagicMock() + agent.name = name + agent.description = f"{name} description" + agent.default_options = {"instructions": instructions, "tools": []} + agent.context_providers = [] + agent.mcp_tools = [] + agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier( + agent, + include_instructions=kw.get("include_instructions", True), + include_tools=kw.get("include_tools", True), + include_context_providers=kw.get("include_context_providers", False), + include_examples=kw.get("include_examples", False), + examples=kw.get("examples"), + ) + return agent + + +def _make_workflow() -> Any: + from agent_framework._evaluation import 
_render_workflow_dossier + + workflow = MagicMock() + workflow.name = "wf-1" + workflow.description = "demo" + workflow.to_dict.return_value = {"name": "wf-1", "id": "wf_1", "executors": {}, "edge_groups": []} + workflow.executors = {} + workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier( + workflow, + include_instructions=kw.get("include_instructions", True), + include_tools=kw.get("include_tools", True), + include_context_providers=kw.get("include_context_providers", False), + include_examples=kw.get("include_examples", False), + examples=kw.get("examples"), + include_topology=kw.get("include_topology", True), + ) + return workflow + + +class TestParseEvalsConfig: + """Parsing already-loaded dicts into RubricGenerationSpec instances.""" + + def test_minimal_spec(self) -> None: + config = parse_evals_config({ + "evaluators": { + "my-rubric": { + "type": "foundry.generated_rubric", + } + } + }) + assert "my-rubric" in config + spec = config["my-rubric"] + assert spec.name == "my-rubric" + assert spec.type == "foundry.generated_rubric" + assert spec.category == "quality" + assert spec.sources == () + + def test_full_spec_with_sources(self) -> None: + config = parse_evals_config({ + "evaluators": { + "reservation-quality": { + "type": "foundry.generated_rubric", + "category": "quality", + "model": "gpt-4o", + "agent": "reservation-agent", + "display_name": "Reservation Quality", + "description": "Custom rubric for reservation agent.", + "sources": [ + { + "type": "agent", + "include_instructions": True, + "include_tools": True, + "include_context_providers": True, + }, + { + "type": "dataset", + "name": "reservation-business-rules", + "version": 1, + }, + ], + } + } + }) + spec = config["reservation-quality"] + assert spec.model == "gpt-4o" + assert spec.agent == "reservation-agent" + assert spec.display_name == "Reservation Quality" + assert len(spec.sources) == 2 + + agent_src = spec.sources[0] + assert agent_src.type == "agent" + assert 
agent_src.include_context_providers is True + + dataset_src = spec.sources[1] + assert dataset_src.type == "dataset" + assert dataset_src.name == "reservation-business-rules" + assert dataset_src.version == "1" # coerced to string + + def test_rejects_non_mapping(self) -> None: + with pytest.raises(ValueError, match="must be a mapping"): + parse_evals_config([]) + + def test_rejects_missing_evaluators_key(self) -> None: + with pytest.raises(ValueError, match="evaluators"): + parse_evals_config({"other": {}}) + + def test_rejects_unknown_type(self) -> None: + with pytest.raises(ValueError, match="unsupported type"): + parse_evals_config({"evaluators": {"x": {"type": "foundry.other"}}}) + + def test_rejects_invalid_category(self) -> None: + with pytest.raises(ValueError, match="invalid category"): + parse_evals_config({"evaluators": {"x": {"type": "foundry.generated_rubric", "category": "bogus"}}}) + + def test_rejects_invalid_source_type(self) -> None: + with pytest.raises(ValueError, match="invalid type"): + parse_evals_config({ + "evaluators": { + "x": { + "type": "foundry.generated_rubric", + "sources": [{"type": "bogus"}], + } + } + }) + + +class TestLoadEvalsConfig: + """End-to-end YAML loading.""" + + def test_load_from_yaml_file(self, tmp_path: Path) -> None: + pytest.importorskip("yaml") + config_path = tmp_path / "evals.yaml" + config_path.write_text( + textwrap.dedent( + """\ + evaluators: + my-eval: + type: foundry.generated_rubric + category: safety + model: gpt-4o-mini + sources: + - type: prompt + prompt: "Score the response." + """ + ), + encoding="utf-8", + ) + config = load_evals_config(config_path) + assert "my-eval" in config + spec = config["my-eval"] + assert spec.category == "safety" + assert spec.model == "gpt-4o-mini" + assert len(spec.sources) == 1 + assert spec.sources[0].type == "prompt" + assert spec.sources[0].prompt == "Score the response." 
+ + +class TestBuildSources: + """Translate RubricGenerationSpec sources into EvalGenerationSource instances.""" + + def test_no_sources_with_agent_default(self) -> None: + spec = RubricGenerationSpec(name="x") + agent = _make_agent() + sources = build_sources(spec, agent=agent) + assert len(sources) == 1 + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Agent name: agent-a" in sources[0].prompt + + def test_no_sources_with_workflow_default(self) -> None: + spec = RubricGenerationSpec(name="x") + workflow = _make_workflow() + sources = build_sources(spec, workflow=workflow) + assert len(sources) == 1 + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Workflow name: wf-1" in sources[0].prompt + + def test_no_sources_no_agent_or_workflow_raises(self) -> None: + spec = RubricGenerationSpec(name="x") + with pytest.raises(ValueError, match="no sources"): + build_sources(spec) + + def test_agent_source_uses_supplied_agent(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="agent", include_context_providers=True),), + ) + agent = _make_agent() + sources = build_sources(spec, agent=agent) + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Agent name: agent-a" in sources[0].prompt + + def test_agent_source_with_agent_name_uses_hosted_path(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="agent", agent_name="hosted-foundry-agent"),), + ) + sources = build_sources(spec) + assert sources[0].type == "agent" + assert sources[0].agent_name == "hosted-foundry-agent" + + def test_agent_source_without_agent_raises(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="agent"),), + ) + with pytest.raises(ValueError, match="no agent="): + build_sources(spec) + + def test_workflow_source_uses_supplied_workflow(self) -> None: + spec = RubricGenerationSpec( + 
name="x", + sources=(RubricSourceSpec(type="workflow", include_topology=False),), + ) + workflow = _make_workflow() + sources = build_sources(spec, workflow=workflow) + assert sources[0].type == "prompt" + assert sources[0].prompt is not None + assert "Workflow name: wf-1" in sources[0].prompt + assert "Topology (JSON):" not in sources[0].prompt + + def test_prompt_source_translates_directly(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="prompt", prompt="Score it."),), + ) + sources = build_sources(spec) + assert sources[0] == EvalGenerationSource(type="prompt", prompt="Score it.") + + def test_dataset_source_translates(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="dataset", name="ds", version="2"),), + ) + sources = build_sources(spec) + assert sources[0].type == "dataset" + assert sources[0].dataset_name == "ds" + assert sources[0].dataset_version == "2" + + def test_traces_source_passes_metadata(self) -> None: + spec = RubricGenerationSpec( + name="x", + sources=(RubricSourceSpec(type="traces", metadata={"environment": "prod"}),), + ) + sources = build_sources(spec) + assert sources[0].type == "traces" + assert sources[0].metadata == {"environment": "prod"} diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py new file mode 100644 index 00000000000..9c19ff552ba --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py @@ -0,0 +1,151 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Generate a Foundry rubric evaluator from an agent and use it in CI. + +This sample demonstrates the end-to-end adaptive-evals flow: + +1. Build an agent. +2. 
Generate a rubric evaluator from the agent using + ``FoundryEvals.generate_rubric()`` — produces a pinned + ``GeneratedEvaluatorRef`` you can store in source control. +3. Use the pinned reference in ``evaluators=[...]`` for a regression + run alongside built-in evaluators. +4. Assert quality gates with ``assert_score_at_least`` / + ``assert_dimension_score_at_least`` / ``assert_no_failed_items``. + +A companion ``evaluators.yaml`` shows the source-controlled config +pattern for CI. Load it with :func:`load_evals_config` and pass the +resulting spec through :func:`build_sources` to keep generation +parameters out of code. + +Prerequisites: +- An Azure AI Foundry project with a deployed model. +- ``azure-ai-projects`` build that includes the rubric-generation APIs. +- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``. + +Run with: + +.. code-block:: bash + + az login + python evaluate_with_generated_rubric_sample.py +""" + +import asyncio +import os +import textwrap +from pathlib import Path + +from agent_framework import evaluate_agent +from agent_framework.foundry import ( + FoundryChatClient, + FoundryEvals, + build_sources, + load_evals_config, +) +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + samples = { + "seattle": "62F, cloudy with a chance of rain", + "london": "55F, overcast", + "paris": "68F, partly sunny", + } + return samples.get(location.lower(), f"Weather data not available for {location}") + + +SAMPLE_YAML = textwrap.dedent( + """\ + evaluators: + travel-quality: + type: foundry.generated_rubric + category: quality + model: gpt-4o + display_name: Travel Quality Rubric + description: Custom rubric tailored to the travel-assistant agent. 
+ sources: + - type: agent + include_instructions: true + include_tools: true + """ +) + + +async def main() -> None: + project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] + model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o") + + credential = AzureCliCredential() + chat_client = FoundryChatClient( + project_endpoint=project_endpoint, + model=model_name, + credential=credential, + ) + project_client = AIProjectClient(endpoint=project_endpoint, credential=credential) + + agent = chat_client.as_agent( + name="travel-assistant", + instructions=( + "You are a helpful travel assistant. Always ground recommendations in tool output, " + "cite each tool result, and refuse questions outside travel planning." + ), + tools=[get_weather], + ) + + # 1. Load the source-controlled evaluator config. + config_path = Path(__file__).with_name("evaluators.yaml") + if not config_path.exists(): + config_path.write_text(SAMPLE_YAML, encoding="utf-8") + print(f"Wrote sample config to {config_path}") + config = load_evals_config(config_path) + spec = config["travel-quality"] + + # 2. Generate (or refresh) the rubric evaluator. In CI you typically run + # this once and commit the returned name/version pair. + print("Generating rubric evaluator from agent + spec...") + sources = build_sources(spec, agent=agent) + rubric_ref = await FoundryEvals.generate_rubric( + project_client=project_client, + name=spec.name, + sources=sources, + category=spec.category, + model=spec.model, + display_name=spec.display_name, + description=spec.description, + ) + print(f"Generated rubric {rubric_ref.name}@{rubric_ref.version} with {len(rubric_ref.dimensions or ())} dimensions") + + # 3. Run an evaluation that combines built-ins with the new rubric. 
+ evals = FoundryEvals( + client=chat_client, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref], + ) + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather in Seattle?", + "Should I pack an umbrella for London?", + ], + evaluators=evals, + ) + + # 4. Quality gates — wire these into your CI job's exit status. + for r in results: + print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}") + r.assert_no_failed_items() + r.assert_score_at_least(0.8) + if rubric_ref.dimensions: + r.assert_dimension_score_at_least(rubric_ref.dimensions[0].id, 3) + + await project_client.close() + await credential.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml new file mode 100644 index 00000000000..f3e698c77ce --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml @@ -0,0 +1,11 @@ +evaluators: + travel-quality: + type: foundry.generated_rubric + category: quality + model: gpt-4o + display_name: Travel Quality Rubric + description: Custom rubric tailored to the travel-assistant agent. + sources: + - type: agent + include_instructions: true + include_tools: true From 276fb769abe2d8832c16a0c3aafb9610adda896c Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Wed, 27 May 2026 08:48:15 -0700 Subject: [PATCH 07/16] Python: fix(evals): address PR review feedback Addresses 4 Copilot review comments on PR #6101: 1. assert_dimension_score_at_least: drop the (not evaluator or found_any) guard so require_applicable=True correctly raises when the named evaluator produces no entries for the dimension. Adds TestRubricAssertions covering the regression. 2. 
GeneratedEvaluatorRef docstring: reword to describe actual behaviour (pinning recommended, not required) so it matches the dataclass default and FoundryEvals warning path. 3. _poll_generation_job: switch from asyncio.get_event_loop() to get_running_loop() and bound the per-iteration sleep by remaining time, matching _poll_eval_run. 4. generate_rubric: type category as Literal['quality','safety'] and validate at the entry point with a ValueError; drop the silent 'invalid -> quality' rewrite in _generation_job_to_ref. Adds a regression test. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/agent_framework/_evaluation.py | 4 +- .../core/tests/core/test_local_eval.py | 106 ++++++++++++++++++ .../agent_framework_foundry/_foundry_evals.py | 28 +++-- .../foundry/tests/test_foundry_evals.py | 16 ++- 4 files changed, 138 insertions(+), 16 deletions(-) diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index 48704d3543c..b14bdee9b22 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -577,7 +577,6 @@ def assert_dimension_score_at_least( def _check(results: EvalResults) -> None: for item in results.items: found_applicable = False - found_any = False for score in item.scores: if evaluator is not None and score.name != evaluator: continue @@ -586,7 +585,6 @@ def _check(results: EvalResults) -> None: for rs in score.dimensions: if rs.id != dimension_id: continue - found_any = True if not rs.applicable: continue found_applicable = True @@ -595,7 +593,7 @@ def _check(results: EvalResults) -> None: f"{item.item_id}/{score.name}/{dimension_id}=" f"{rs.score if rs.score is not None else 'None'}" ) - if require_applicable and not found_applicable and (not evaluator or found_any): + if require_applicable and not found_applicable: missing_items.append(item.item_id) for sub in results.sub_results.values(): 
_check(sub) diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py index c13b107c4bd..e4c37dfb4b4 100644 --- a/python/packages/core/tests/core/test_local_eval.py +++ b/python/packages/core/tests/core/test_local_eval.py @@ -12,8 +12,13 @@ from agent_framework._evaluation import ( CheckResult, EvalItem, + EvalItemResult, + EvalNotPassedError, + EvalResults, + EvalScoreResult, ExpectedToolCall, LocalEvaluator, + RubricScore, _coerce_result, evaluator, keyword_check, @@ -1010,6 +1015,107 @@ def test_all_passed_parent_fails_when_own_counts_fail(self): assert parent.all_passed is False +# --------------------------------------------------------------------------- +# Rubric assertions (EvalResults.assert_*) +# --------------------------------------------------------------------------- + + +def _rubric_results(*scores_per_item: list[EvalScoreResult]) -> EvalResults: + items = [ + EvalItemResult(item_id=f"item-{i}", status="pass", scores=scores) for i, scores in enumerate(scores_per_item) + ] + return EvalResults( + provider="test", + eval_id="ev1", + run_id="run1", + result_counts={"passed": len(items), "failed": 0, "errored": 0, "total": len(items)}, + items=items, + ) + + +class TestRubricAssertions: + """Tests for EvalResults.assert_dimension_score_at_least.""" + + def test_dimension_at_or_above_threshold_passes(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ) + ], + ) + # Should not raise. 
+ results.assert_dimension_score_at_least("clarity", 3) + + def test_dimension_below_threshold_raises(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=0.5, + dimensions=[RubricScore(id="clarity", score=2, applicable=True, weight=1, reason="")], + ) + ], + ) + with pytest.raises(EvalNotPassedError): + results.assert_dimension_score_at_least("clarity", 3) + + def test_non_applicable_skipped_by_default(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="policy", + score=1.0, + dimensions=[RubricScore(id="clarity", score=None, applicable=False, weight=1, reason="n/a")], + ) + ], + ) + # No applicable scores; default behaviour is to skip silently. + results.assert_dimension_score_at_least("clarity", 3) + + def test_require_applicable_raises_when_dimension_absent(self) -> None: + results = _rubric_results( + [EvalScoreResult(name="policy", score=1.0, dimensions=[])], + ) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("clarity", 3, require_applicable=True) + + def test_require_applicable_raises_when_filtered_evaluator_missing(self) -> None: + # Regression: previously the (not evaluator or found_any) guard caused + # this case to silently pass even with require_applicable=True. 
+ results = _rubric_results( + [ + EvalScoreResult( + name="other", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ) + ], + ) + with pytest.raises(EvalNotPassedError, match="not applicable"): + results.assert_dimension_score_at_least("clarity", 3, evaluator="policy", require_applicable=True) + + def test_evaluator_filter_isolates_offenders(self) -> None: + results = _rubric_results( + [ + EvalScoreResult( + name="other", + score=0.1, + dimensions=[RubricScore(id="clarity", score=1, applicable=True, weight=1, reason="")], + ), + EvalScoreResult( + name="policy", + score=0.9, + dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")], + ), + ], + ) + # The low-scoring "other" evaluator is filtered out; "policy" passes. + results.assert_dimension_score_at_least("clarity", 3, evaluator="policy") + + # --------------------------------------------------------------------------- # Eval source rendering (string dossiers) # --------------------------------------------------------------------------- diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 0d83d8b1bc3..5241c4d268f 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -96,9 +96,11 @@ class GeneratedEvaluatorRef: when the evaluator already exists, or obtain one from :meth:`FoundryEvals.generate_rubric`. - By default ``version`` is required and pinned so an evaluation run is - reproducible. Use :meth:`latest` to opt in to versionless references - explicitly. + Pinning ``version`` is strongly recommended so evaluation runs are + reproducible. 
The dataclass accepts ``version=None`` for the + convenience of :meth:`latest`, but ``FoundryEvals`` emits a warning + whenever a versionless reference is used; CI gates should always + pass a concrete version. Attributes: name: Evaluator name as stored in the Foundry project (e.g. @@ -1147,7 +1149,7 @@ async def generate_rubric( agent: BaseAgent | None = None, workflow: Workflow | None = None, sources: Sequence[EvalGenerationSource] | None = None, - category: str = "quality", + category: Literal["quality", "safety"] = "quality", model: str | None = None, display_name: str | None = None, description: str | None = None, @@ -1209,6 +1211,9 @@ async def generate_rubric( """ resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources) + if category not in ("quality", "safety"): + raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.") + try: sdk_types = _import_generation_sdk_types() except _RubricSdkUnavailableError as exc: @@ -1389,7 +1394,8 @@ async def _poll_generation_job( if not job_id: raise RuntimeError("Rubric generation job did not return an id.") - deadline = asyncio.get_event_loop().time() + timeout + loop = asyncio.get_running_loop() + deadline = loop.time() + timeout current = job while True: status = (getattr(current, "status", "") or "").lower() @@ -1399,15 +1405,16 @@ async def _poll_generation_job( err_msg = getattr(err, "message", None) or str(err) if err is not None else status raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {err_msg}") return current - if asyncio.get_event_loop().time() >= deadline: + remaining = deadline - loop.time() + if remaining <= 0: raise TimeoutError( f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})." 
) - await asyncio.sleep(poll_interval) + await asyncio.sleep(min(poll_interval, remaining)) current = await evaluators_ops.get_generation_job(job_id) -def _generation_job_to_ref(job: Any, *, category: str) -> GeneratedEvaluatorRef: +def _generation_job_to_ref(job: Any, *, category: Literal["quality", "safety"]) -> GeneratedEvaluatorRef: """Build a pinned :class:`GeneratedEvaluatorRef` from a completed job.""" artifacts: Any = getattr(job, "artifacts", None) evaluator: Any = getattr(artifacts, "evaluator", None) if artifacts is not None else None @@ -1447,13 +1454,10 @@ def _generation_job_to_ref(job: Any, *, category: str) -> GeneratedEvaluatorRef: if isinstance(raw_threshold, (int, float)): pass_threshold = float(raw_threshold) - valid_category: str - valid_category = category if category in ("quality", "safety") else "quality" - return GeneratedEvaluatorRef( name=str(ev_name), version=str(ev_version), - category=cast("Any", valid_category), + category=category, display_name=getattr(evaluator, "display_name", None), description=getattr(evaluator, "description", None), dimensions=dimensions, diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index 16dc9d50ce7..aee819eee71 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -6,7 +6,7 @@ import json from dataclasses import dataclass -from typing import Any +from typing import Any, cast from unittest.mock import AsyncMock, MagicMock import pytest @@ -3305,6 +3305,20 @@ def _raise() -> Any: sources=[EvalGenerationSource(type="prompt", prompt="hi")], ) + async def test_raises_value_error_on_invalid_category(self) -> None: + """category outside {quality, safety} should fail fast at the boundary.""" + from agent_framework_foundry._foundry_evals import EvalGenerationSource + + project_client = MagicMock() + + with pytest.raises(ValueError, match="category"): + await 
FoundryEvals.generate_rubric( + project_client=project_client, + name="my-eval", + sources=[EvalGenerationSource(type="prompt", prompt="hi")], + category=cast("Any", "invalid"), + ) + class TestGenerateRubricE2E: """End-to-end happy path for generate_rubric with mocked SDK.""" From 9a2c96404aaa9b811a15d7bde8a07eb78e22bf55 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Wed, 27 May 2026 14:35:17 -0700 Subject: [PATCH 08/16] Python: feat(foundry-evals): hosted-agent-aware rubric generation * Auto-detect hosted Foundry agents in agent_as_eval_source: when the agent's chat_client exposes a string agent_name (the convention used by RawFoundryAgentChatClient for PromptAgents/HostedAgents), emit a type='agent' EvalGenerationSource so the service fetches instructions and tools from the agent registry instead of relying on the local wrapper (which holds neither for hosted agents). * Add hosted_agent_version kwarg and a new agent_version field on EvalGenerationSource so PromptAgent runs can pin to a specific hosted version for reproducible rubric generation. * Add force_prompt_source escape hatch to bypass auto-detection and always emit a rendered prompt dossier - useful when the local wrapper carries overrides the service-side agent doesn't see. * Fix _to_sdk_source for dataset sources: SDK ctor takes name=/version=, not dataset_name=/dataset_version=. The mismatch would raise TypeError against the real azure-ai-projects 2.3.0a* SDK; only unmocked integration paths were affected. Tests cover: auto-detection happy path, versionless hosted agent, explicit hosted_agent_version forwarding, force_prompt_source override, non-string chat_client attrs (MagicMock test doubles) not mis-detected, agent_version forwarded through _to_sdk_source, and the corrected dataset SDK kwarg names. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/_foundry_evals.py | 106 ++++++++++++++---- .../foundry/tests/test_foundry_evals.py | 83 +++++++++++++- 2 files changed, 168 insertions(+), 21 deletions(-) diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 5241c4d268f..2b8d7913e08 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -183,6 +183,9 @@ class EvalGenerationSource: description: Optional short description shown in Foundry UI. prompt: Rendered dossier for ``type="prompt"`` sources. agent_name: Hosted Foundry agent name for ``type="agent"`` sources. + agent_version: Optional pinned hosted-agent version for + ``type="agent"`` sources. ``None`` resolves to the latest + version at generation time; pin for reproducible runs. dataset_name: Foundry dataset name for ``type="dataset"`` sources. dataset_version: Pinned dataset version (recommended for repro). metadata: Free-form metadata. Used by ``type="traces"`` sources @@ -194,6 +197,7 @@ class EvalGenerationSource: description: str | None = None prompt: str | None = None agent_name: str | None = None + agent_version: str | None = None dataset_name: str | None = None dataset_version: str | None = None metadata: dict[str, Any] | None = None @@ -209,41 +213,79 @@ def agent_as_eval_source( include_examples: bool = False, examples: Sequence[str] | None = None, hosted_agent_name: str | None = None, + hosted_agent_version: str | None = None, + force_prompt_source: bool = False, ) -> EvalGenerationSource: """Render an agent as an :class:`EvalGenerationSource` for rubric generation. - Wraps :meth:`BaseAgent.as_eval_source` to package the agent's - rendered dossier into a typed Foundry generation source. 
When - ``hosted_agent_name`` is provided, returns a ``type="agent"`` source - referencing the hosted Foundry agent so the service fetches - server-side metadata directly instead of using a rendered dossier. + Picks the best Foundry source variant for the supplied agent: + + * **Hosted Foundry agents** (``FoundryAgent`` connected to a Prompt + Agent or Hosted Agent in a Foundry project) are emitted as + ``type="agent"`` sources keyed by ``agent_name`` so the service + fetches instructions, tools, and metadata directly from the agent + registry — independent of whatever the local wrapper happens to + hold. Detected automatically from ``agent.chat_client.agent_name`` + and ``agent.chat_client.agent_version``. + * **Local agents** (any other ``BaseAgent`` whose instructions and + tools live client-side, e.g. ``FoundryChatClient``-backed agents or + pure OpenAI Responses agents) are emitted as ``type="prompt"`` + sources with a rendered text dossier. + + Override the heuristic by passing ``hosted_agent_name`` explicitly + (forces an ``"agent"`` source) or ``force_prompt_source=True`` + (forces a ``"prompt"`` source — useful when you want the service to + score a hosted agent against the *local* wrapper's overrides). Args: agent: Agent instance (typically a ``BaseAgent`` subclass). include_instructions: Whether to include the agent's instructions - text. Defaults to ``True``. - include_tools: Whether to include tool definitions. Defaults to + text in the dossier (``"prompt"`` sources only). Defaults to ``True``. + include_tools: Whether to include tool definitions in the dossier + (``"prompt"`` sources only). Defaults to ``True``. include_context_providers: Whether to include the names of - attached context-provider classes. Defaults to ``False`` to - avoid leaking implementation details. - include_examples: Whether to include the supplied ``examples``. - Defaults to ``False`` to avoid shipping potentially sensitive - sample inputs by default. 
+ attached context-provider classes in the dossier + (``"prompt"`` sources only). Defaults to ``False`` to avoid + leaking implementation details. + include_examples: Whether to include the supplied ``examples`` in + the dossier (``"prompt"`` sources only). Defaults to + ``False`` to avoid shipping potentially sensitive sample + inputs by default. examples: Optional sample queries / interactions to include when ``include_examples`` is ``True``. hosted_agent_name: When set, emit a ``type="agent"`` source - referencing the hosted Foundry agent by name instead of a - rendered dossier. + referencing this hosted Foundry agent name regardless of + auto-detection. Use to override or supplement the + heuristic. + hosted_agent_version: When set together with a hosted-agent + source, pins the source to a specific hosted-agent version. + Recommended for reproducible rubric generation against + PromptAgents. + force_prompt_source: When ``True``, always emit a + ``type="prompt"`` source with the rendered dossier even when + the agent is a hosted Foundry agent. Useful when the local + wrapper holds overrides the service-side agent doesn't see. Returns: An :class:`EvalGenerationSource` describing the agent. 
""" - if hosted_agent_name: - agent_description = getattr(agent, "description", None) + agent_description = getattr(agent, "description", None) + + resolved_name = hosted_agent_name + resolved_version = hosted_agent_version + if resolved_name is None and not force_prompt_source: + detected_name, detected_version = _detect_hosted_foundry_agent(agent) + if detected_name is not None: + resolved_name = detected_name + if resolved_version is None: + resolved_version = detected_version + + if resolved_name is not None and not force_prompt_source: return EvalGenerationSource( type="agent", - agent_name=hosted_agent_name, + agent_name=resolved_name, + agent_version=resolved_version, description=agent_description, ) @@ -254,7 +296,6 @@ def agent_as_eval_source( include_examples=include_examples, examples=examples, ) - agent_description = getattr(agent, "description", None) return EvalGenerationSource( type="prompt", prompt=prompt, @@ -262,6 +303,28 @@ def agent_as_eval_source( ) +def _detect_hosted_foundry_agent(agent: BaseAgent) -> tuple[str | None, str | None]: + """Return ``(agent_name, agent_version)`` for hosted Foundry agents, else ``(None, None)``. + + A hosted Foundry agent is one whose ``chat_client`` exposes a string + ``agent_name`` — the convention used by ``RawFoundryAgentChatClient`` + when ``FoundryAgent`` connects to an existing Prompt Agent or Hosted + Agent in a Foundry project. Only string values are accepted so + test doubles using ``MagicMock`` for ``chat_client`` are not + mis-detected. 
+ """ + chat_client = getattr(agent, "chat_client", None) + if chat_client is None: + return None, None + name = getattr(chat_client, "agent_name", None) + version = getattr(chat_client, "agent_version", None) + if not isinstance(name, str) or not name: + return None, None + if not isinstance(version, str) or not version: + version = None + return name, version + + @experimental(feature_id=ExperimentalFeature.EVALS) def workflow_as_eval_source( workflow: Workflow, @@ -1354,6 +1417,8 @@ def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) if not source.agent_name: raise ValueError("EvalGenerationSource(type='agent') requires agent_name.") kwargs = {"agent_name": source.agent_name} + if source.agent_version is not None: + kwargs["agent_version"] = source.agent_version if source.description is not None: kwargs["description"] = source.description return sdk_types.AgentSource(**kwargs) @@ -1364,9 +1429,10 @@ def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) ) if not source.dataset_name: raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.") - kwargs = {"dataset_name": source.dataset_name} + # SDK uses ``name`` / ``version`` (not ``dataset_name`` / ``dataset_version``). 
+ kwargs = {"name": source.dataset_name} if source.dataset_version is not None: - kwargs["dataset_version"] = source.dataset_version + kwargs["version"] = source.dataset_version if source.description is not None: kwargs["description"] = source.description return sdk_types.DatasetSource(**kwargs) diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index aee819eee71..7244347e05b 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -3020,6 +3020,75 @@ def test_hosted_agent_name_emits_agent_source(self) -> None: assert source.prompt is None assert source.description == "Looks up the weather." + def test_explicit_hosted_agent_version_forwarded(self) -> None: + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot") + source = agent_as_eval_source( + agent, + hosted_agent_name="weather-bot-hosted-id", + hosted_agent_version="3", + ) + assert source.type == "agent" + assert source.agent_name == "weather-bot-hosted-id" + assert source.agent_version == "3" + + def test_auto_detects_hosted_foundry_agent(self) -> None: + """A chat_client carrying agent_name/agent_version is treated as a hosted agent.""" + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") + agent.chat_client = MagicMock() + agent.chat_client.agent_name = "weather-prompt-agent" + agent.chat_client.agent_version = "2" + + source = agent_as_eval_source(agent) + assert source.type == "agent" + assert source.agent_name == "weather-prompt-agent" + assert source.agent_version == "2" + assert source.prompt is None + assert source.description == "Looks up the weather." 
+ + def test_auto_detection_handles_versionless_hosted_agent(self) -> None: + """HostedAgents typically omit agent_version (no None forwarded).""" + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot") + agent.chat_client = MagicMock() + agent.chat_client.agent_name = "weather-hosted-agent" + agent.chat_client.agent_version = None + + source = agent_as_eval_source(agent) + assert source.type == "agent" + assert source.agent_name == "weather-hosted-agent" + assert source.agent_version is None + + def test_force_prompt_source_overrides_auto_detection(self) -> None: + """force_prompt_source=True falls back to dossier even for hosted agents.""" + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") + agent.chat_client = MagicMock() + agent.chat_client.agent_name = "weather-prompt-agent" + agent.chat_client.agent_version = "2" + + source = agent_as_eval_source(agent, force_prompt_source=True) + assert source.type == "prompt" + assert source.prompt is not None + assert "Agent name: weather-bot" in source.prompt + + def test_auto_detection_ignores_non_string_chat_client_fields(self) -> None: + """Bare MagicMock chat_client (untyped attrs) must not trigger detection.""" + from agent_framework_foundry._foundry_evals import agent_as_eval_source + + agent = _make_stub_agent(name="local-agent") + agent.chat_client = MagicMock() # agent_name attr resolves to a MagicMock, not a str + + source = agent_as_eval_source(agent) + assert source.type == "prompt" + assert source.prompt is not None + assert "Agent name: local-agent" in source.prompt + def test_forwards_keyword_options_to_agent(self) -> None: from agent_framework_foundry._foundry_evals import agent_as_eval_source @@ -3192,7 +3261,19 @@ def test_dataset_source_is_translated(self) -> None: sdk, ) assert out == "dataset-sdk-instance" - 
sdk.DatasetSource.assert_called_once_with(dataset_name="ds", dataset_version="1") + sdk.DatasetSource.assert_called_once_with(name="ds", version="1") + + def test_agent_source_forwards_agent_version(self) -> None: + from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source + + sdk = self._make_sdk_types() + sdk.AgentSource.return_value = "agent-sdk-instance" + out = _to_sdk_source( + EvalGenerationSource(type="agent", agent_name="prompt-agent", agent_version="2"), + sdk, + ) + assert out == "agent-sdk-instance" + sdk.AgentSource.assert_called_once_with(agent_name="prompt-agent", agent_version="2") class TestPollGenerationJob: From 31f81078243de9eb79584c037dec881da3d36204 Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Thu, 28 May 2026 09:12:57 -0700 Subject: [PATCH 09/16] fix(foundry-evals): accept canonical dimension_scores key per docs The published Foundry rubric-evaluator output (Microsoft Learn 'Rubric evaluators' reference) places per-dimension breakdowns under properties.dimension_scores, not properties.rubric_scores. The parser now tries dimension_scores first and falls back to rubric_scores for preview-build compatibility, and tolerates non-list payloads (e.g. MagicMock auto-attrs) by trying the next candidate when parsing yields zero entries. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/_foundry_evals.py | 99 +++++++++++++------ .../foundry/tests/test_foundry_evals.py | 50 ++++++++++ 2 files changed, 117 insertions(+), 32 deletions(-) diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 2b8d7913e08..ae9de97d6cf 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -28,7 +28,7 @@ import asyncio import logging -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Literal, cast @@ -723,42 +723,31 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int] return per_eval -def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None: - """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload. +_RUBRIC_DIMENSION_KEYS: tuple[str, ...] = ("dimension_scores", "rubric_scores") +"""Property keys that may carry per-dimension rubric breakdowns. - Foundry rubric evaluators include a per-dimension breakdown under - ``properties.rubric_scores`` on each result. The exact location may - vary across SDK versions, so this helper accepts a few shapes: +The published Foundry rubric-evaluator output format uses +``properties.dimension_scores`` (see the Microsoft Learn "Rubric +evaluators" reference). Earlier preview builds and some SDK shapes +used ``rubric_scores``; we accept both for defensive forward/backward +compatibility. +""" - * The SDK ``sample`` object exposes ``properties.rubric_scores``. - * The ``sample`` is a dict containing ``properties.rubric_scores``. - * The ``sample`` is a dict with ``rubric_scores`` at the top level. - Returns ``None`` when no rubric scores are present (i.e. 
the - evaluator was not a rubric evaluator). - """ - if sample is None: - return None - - raw: Any = None - properties: Any = getattr(sample, "properties", None) - if properties is not None: - raw = getattr(properties, "rubric_scores", None) - if raw is None and isinstance(properties, dict): - raw = cast("dict[str, Any]", properties).get("rubric_scores") - if raw is None and isinstance(sample, dict): - sample_any = cast("dict[str, Any]", sample) - props_dict: Any = sample_any.get("properties") - if isinstance(props_dict, dict): - raw = cast("dict[str, Any]", props_dict).get("rubric_scores") - if raw is None: - raw = sample_any.get("rubric_scores") +def _parse_dimension_entries(raw: Any) -> list[RubricScore]: + """Parse a raw list-like payload into ``RubricScore`` instances. + Returns an empty list when ``raw`` is falsy, not iterable, or + contains no well-formed entries. + """ if not raw: - return None + return [] + try: + raw_iter: Iterable[Any] = iter(raw) + except TypeError: + return [] parsed: list[RubricScore] = [] - raw_iter: Any = raw for raw_entry in raw_iter: entry: Any = raw_entry try: @@ -792,8 +781,54 @@ def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None: ) ) except (TypeError, ValueError): - logger.debug("Skipping malformed rubric_scores entry: %s", cast("Any", entry), exc_info=True) - return parsed or None + logger.debug("Skipping malformed rubric dimension entry: %s", cast("Any", entry), exc_info=True) + return parsed + + +def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None: + """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload. + + Foundry rubric evaluators include a per-dimension breakdown under + ``properties.dimension_scores`` on each result (preview builds used + ``rubric_scores``; both keys are accepted, with the canonical + ``dimension_scores`` taking priority). 
The exact location may + vary across SDK versions, so this helper accepts a few shapes: + + * The SDK ``sample`` object exposes + ``properties.dimension_scores`` / ``properties.rubric_scores``. + * The ``sample`` is a dict containing the same under + ``properties.``. + * The ``sample`` is a dict with ``dimension_scores`` / + ``rubric_scores`` at the top level. + + Returns ``None`` when no rubric scores are present (i.e. the + evaluator was not a rubric evaluator). + """ + if sample is None: + return None + + containers: list[Any] = [] + properties: Any = getattr(sample, "properties", None) + if properties is not None: + containers.append(properties) + if isinstance(sample, dict): + sample_any = cast("dict[str, Any]", sample) + props_dict: Any = sample_any.get("properties") + if props_dict is not None and props_dict is not properties: + containers.append(props_dict) + containers.append(sample_any) + + for container in containers: + for key in _RUBRIC_DIMENSION_KEYS: + raw: Any = None + if isinstance(container, dict): + raw = cast("dict[str, Any]", container).get(key) + elif hasattr(container, key): + raw = getattr(container, key, None) + parsed = _parse_dimension_entries(raw) + if parsed: + return parsed + return None async def _fetch_output_items( diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index 7244347e05b..bffb0c066a7 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -2605,6 +2605,56 @@ def test_skips_malformed_entries(self) -> None: assert len(result) == 1 assert result[0].id == "good" + def test_canonical_dimension_scores_key_from_docs(self) -> None: + """Per the Microsoft Learn docs, runtime output uses ``properties.dimension_scores``.""" + from agent_framework_foundry._foundry_evals import _extract_rubric_scores + + sample = { + "properties": { + "dimension_scores": [ + { + "id": "intent_recognition", + 
"score": 5, + "applicable": True, + "weight": 9, + "reason": "Identified correctly.", + }, + { + "id": "general_quality", + "score": 4, + "applicable": True, + "weight": 5, + "reason": "Strong overall.", + }, + ] + } + } + result = _extract_rubric_scores(sample) + assert result is not None + assert [r.id for r in result] == ["intent_recognition", "general_quality"] + assert [r.score for r in result] == [5, 4] + assert [r.weight for r in result] == [9, 5] + + def test_dimension_scores_via_attribute(self) -> None: + """Canonical key also resolves when properties exposes ``dimension_scores`` as an attr.""" + from agent_framework_foundry._foundry_evals import _extract_rubric_scores + + rs = MagicMock() + rs.id = "policy_enforcement" + rs.score = 1 + rs.applicable = True + rs.weight = 5 + rs.reason = "violated" + + sample = MagicMock() + sample.properties = MagicMock(spec=["dimension_scores"]) + sample.properties.dimension_scores = [rs] + + result = _extract_rubric_scores(sample) + assert result is not None + assert result[0].id == "policy_enforcement" + assert result[0].score == 1 + # --------------------------------------------------------------------------- # _poll_eval_run — timeout / failed / canceled paths From f76343059d2e8648d672ed92a6b475e4524b9e87 Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Thu, 28 May 2026 09:23:28 -0700 Subject: [PATCH 10/16] feat(foundry-evals): add manual create_rubric_evaluator Adds FoundryEvals.create_rubric_evaluator as the agent-framework surface over project_client.beta.evaluators.create_version. This is the manual counterpart to generate_rubric: callers supply RubricDimension instances (authored locally, ported from another framework, or hand-tuned) and we POST a RubricBasedEvaluatorDefinition. The service auto-attaches the non-editable residual dimension (general_quality for quality, general_policy_compliance for safety). 
Per the Microsoft Learn 'Rubric evaluators' reference, the auto-generation path (create_generation_job) is primarily a portal/UI feature; external SDK clients with rich local agent context are better served by manual create_version. This keeps generate_rubric for users who want to round-trip through a Foundry-registered agent. Validation up front: weight must be in [1,10], ids unique, descriptions non-empty, pass_threshold in [0,1]. The returned GeneratedEvaluatorRef is identical in shape to one obtained from generate_rubric, so downstream evaluators= lists work unchanged. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/_foundry_evals.py | 227 +++++++++++++++++ .../foundry/tests/test_foundry_evals.py | 241 ++++++++++++++++++ 2 files changed, 468 insertions(+) diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index ae9de97d6cf..7fddd64c38c 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -1349,6 +1349,104 @@ async def generate_rubric( return _generation_job_to_ref(completed, category=category) + @classmethod + @experimental(feature_id=ExperimentalFeature.EVALS) + async def create_rubric_evaluator( + cls, + *, + project_client: AIProjectClient, + name: str, + dimensions: Sequence[RubricDimension], + category: Literal["quality", "safety"] = "quality", + pass_threshold: float | None = None, + display_name: str | None = None, + description: str | None = None, + tags: dict[str, str] | None = None, + metadata: dict[str, str] | None = None, + ) -> GeneratedEvaluatorRef: + """Register a rubric evaluator from caller-supplied dimensions. + + This is the *manual* counterpart to :meth:`generate_rubric` and + maps directly to ``project_client.beta.evaluators.create_version``. 
+ Use it to bring a rubric you authored elsewhere (e.g. authored + from an agent's local context, ported from another framework, or + hand-tuned) into Foundry as a versioned ``EvaluatorVersion`` + that any subsequent ``evaluators=`` list can reference via the + returned :class:`GeneratedEvaluatorRef`. + + The service auto-attaches a non-editable residual dimension + (``general_quality`` for ``category="quality"``, + ``general_policy_compliance`` for ``"safety"``) — do not include + it in ``dimensions``. + + Keyword Args: + project_client: Async ``AIProjectClient`` for the target + Foundry project. + name: Stable evaluator name (e.g. + ``"reservation-agent-policy-v1"``). A new version is + allocated on each call. + dimensions: One or more :class:`RubricDimension` instances + describing the scoring blueprint. Each dimension's + ``id`` must be unique; ``weight`` must be in ``[1, 10]``. + category: ``"quality"`` (default) or ``"safety"``. + pass_threshold: Optional aggregate pass threshold on the + normalized 0.0-1.0 scale. Defaults to the service-side + default of ``0.5`` when omitted. + display_name: Optional human-readable name shown in the + Foundry portal. + description: Optional asset description. + tags: Optional asset tags. + metadata: Optional free-form metadata persisted with the + evaluator definition. + + Returns: + A pinned :class:`GeneratedEvaluatorRef` referring to the + newly created evaluator version. + + Raises: + ValueError: If ``dimensions`` is empty, contains duplicate + ids, or contains a weight outside ``[1, 10]``. + NotImplementedError: If the installed ``azure-ai-projects`` + version does not expose the manual rubric APIs. 
+ """ + if category not in ("quality", "safety"): + raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.") + if pass_threshold is not None and not (0.0 <= pass_threshold <= 1.0): + raise ValueError(f"pass_threshold must be in [0.0, 1.0] when set (got {pass_threshold!r}).") + if not dimensions: + raise ValueError("create_rubric_evaluator requires at least one dimension.") + + try: + sdk_types = _import_manual_rubric_sdk_types() + except _RubricSdkUnavailableError as exc: + raise NotImplementedError(str(exc)) from exc + + sdk_dimensions = _to_sdk_dimensions(dimensions, sdk_types.Dimension) + definition_kwargs: dict[str, Any] = {"dimensions": sdk_dimensions} + if pass_threshold is not None: + definition_kwargs["pass_threshold"] = pass_threshold + definition = sdk_types.RubricBasedEvaluatorDefinition(**definition_kwargs) + + version_kwargs: dict[str, Any] = { + "evaluator_type": "custom", + "categories": [category], + "definition": definition, + } + if display_name is not None: + version_kwargs["display_name"] = display_name + if description is not None: + version_kwargs["description"] = description + if tags is not None: + version_kwargs["tags"] = tags + if metadata is not None: + version_kwargs["metadata"] = metadata + + evaluator_version = sdk_types.EvaluatorVersion(**version_kwargs) + evaluators_ops = _get_beta_evaluators(project_client) + created = await evaluators_ops.create_version(name, evaluator_version=evaluator_version) + + return _evaluator_version_to_ref(created, fallback_name=name, category=category) + _TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"}) @@ -1369,6 +1467,15 @@ class _GenerationSdkTypes: TracesSource: Any | None +@dataclass(frozen=True) +class _ManualRubricSdkTypes: + """Resolved SDK type handles for manual rubric-evaluator creation.""" + + EvaluatorVersion: Any + RubricBasedEvaluatorDefinition: Any + Dimension: Any + + _RUBRIC_SDK_MISSING_MSG = ( 
"FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs " "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). " @@ -1378,6 +1485,16 @@ class _GenerationSdkTypes: ) +_MANUAL_RUBRIC_SDK_MISSING_MSG = ( + "FoundryEvals.create_rubric_evaluator requires the manual rubric-evaluator " + "APIs from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev " + "feed). Install a build that exposes " + "`azure.ai.projects.models.RubricBasedEvaluatorDefinition`, " + "`azure.ai.projects.models.Dimension`, and " + "`AIProjectClient.beta.evaluators.create_version`." +) + + def _import_generation_sdk_types() -> _GenerationSdkTypes: """Lazily resolve the rubric-generation SDK types from azure-ai-projects.""" try: @@ -1406,6 +1523,116 @@ def _import_generation_sdk_types() -> _GenerationSdkTypes: ) +def _import_manual_rubric_sdk_types() -> _ManualRubricSdkTypes: + """Lazily resolve the manual rubric-evaluator SDK types from azure-ai-projects.""" + try: + from azure.ai.projects import models as _models # type: ignore[import-not-found] + except ImportError as exc: + raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG) from exc + + models_mod: Any = _models + version_cls: Any = getattr(models_mod, "EvaluatorVersion", None) + definition_cls: Any = getattr(models_mod, "RubricBasedEvaluatorDefinition", None) + dimension_cls: Any = getattr(models_mod, "Dimension", None) + if version_cls is None or definition_cls is None or dimension_cls is None: + raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG) + + return _ManualRubricSdkTypes( + EvaluatorVersion=version_cls, + RubricBasedEvaluatorDefinition=definition_cls, + Dimension=dimension_cls, + ) + + +def _to_sdk_dimensions( + dimensions: Sequence[RubricDimension], + dimension_cls: Any, +) -> list[Any]: + """Translate user-facing ``RubricDimension`` instances to SDK ``Dimension`` models. 
+ + The agent-framework type uses ``id`` (matching the runtime output + schema and competing frameworks); the SDK input model uses + ``dimension_id`` for the editable identifier. + """ + if not dimensions: + raise ValueError("create_rubric_evaluator requires at least one dimension.") + seen: set[str] = set() + sdk_dims: list[Any] = [] + for dim in dimensions: + if not dim.id: + raise ValueError("RubricDimension.id must be a non-empty string.") + if not dim.description: + raise ValueError(f"RubricDimension(id={dim.id!r}).description must be non-empty.") + if not isinstance(dim.weight, int) or not (1 <= dim.weight <= 10): + raise ValueError(f"RubricDimension(id={dim.id!r}).weight must be an int in [1, 10] (got {dim.weight!r}).") + if dim.id in seen: + raise ValueError(f"Duplicate RubricDimension.id={dim.id!r}; ids must be unique within a rubric.") + seen.add(dim.id) + kwargs: dict[str, Any] = { + "dimension_id": dim.id, + "description": dim.description, + "weight": dim.weight, + } + if dim.always_applicable: + kwargs["always_applicable"] = True + sdk_dims.append(dimension_cls(**kwargs)) + return sdk_dims + + +def _evaluator_version_to_ref( + created: Any, + *, + fallback_name: str, + category: Literal["quality", "safety"], +) -> GeneratedEvaluatorRef: + """Translate a persisted ``EvaluatorVersion`` to a :class:`GeneratedEvaluatorRef`. + + Used by both the generation-job path and the manual ``create_version`` + path so callers see a uniform pinned reference regardless of how the + evaluator was authored. + """ + ev_name = getattr(created, "name", None) or fallback_name + ev_version = getattr(created, "version", None) + if ev_version is None: + raise RuntimeError("Created evaluator version is missing a version identifier.") + + definition: Any = getattr(created, "definition", None) + dimensions: tuple[RubricDimension, ...] 
| None = None + raw_dims: Any = getattr(definition, "dimensions", None) if definition is not None else None + if raw_dims: + parsed: list[RubricDimension] = [] + for entry in raw_dims: + dim_id = getattr(entry, "dimension_id", None) or getattr(entry, "id", None) + try: + parsed.append( + RubricDimension( + id=str(dim_id or ""), + description=str(getattr(entry, "description", "") or ""), + weight=int(getattr(entry, "weight", 0) or 0), + always_applicable=bool(getattr(entry, "always_applicable", False)), + ) + ) + except (TypeError, ValueError): + logger.debug("Skipping malformed dimension on persisted evaluator", exc_info=True) + if parsed: + dimensions = tuple(parsed) + + pass_threshold: float | None = None + raw_threshold: Any = getattr(definition, "pass_threshold", None) if definition is not None else None + if isinstance(raw_threshold, (int, float)): + pass_threshold = float(raw_threshold) + + return GeneratedEvaluatorRef( + name=str(ev_name), + version=str(ev_version), + category=category, + display_name=getattr(created, "display_name", None), + description=getattr(created, "description", None), + dimensions=dimensions, + pass_threshold=pass_threshold, + ) + + def _get_beta_evaluators(project_client: AIProjectClient) -> Any: """Return the ``project_client.beta.evaluators`` operations group, or raise.""" beta = getattr(project_client, "beta", None) diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index bffb0c066a7..d24c528a744 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -27,6 +27,7 @@ from agent_framework_foundry._foundry_evals import ( FoundryEvals, + RubricDimension, _build_item_schema, _build_testing_criteria, _extract_per_evaluator, @@ -3530,3 +3531,243 @@ async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) job_cls.assert_called_once_with(inputs="sdk-inputs") 
evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123") + + +# --------------------------------------------------------------------------- +# FoundryEvals.create_rubric_evaluator — manual rubric registration +# --------------------------------------------------------------------------- + + +class TestCreateRubricEvaluatorValidation: + """Argument validation for ``FoundryEvals.create_rubric_evaluator``.""" + + async def test_rejects_empty_dimensions(self) -> None: + with pytest.raises(ValueError, match="at least one dimension"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[], + ) + + async def test_rejects_invalid_category(self) -> None: + with pytest.raises(ValueError, match="category"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[RubricDimension(id="a", description="d", weight=5)], + category="bogus", # type: ignore[arg-type] + ) + + async def test_rejects_out_of_range_pass_threshold(self) -> None: + with pytest.raises(ValueError, match="pass_threshold"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[RubricDimension(id="a", description="d", weight=5)], + pass_threshold=1.5, + ) + + async def test_rejects_duplicate_dimension_ids(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + + sdk = fm._ManualRubricSdkTypes( + EvaluatorVersion=MagicMock(), + RubricBasedEvaluatorDefinition=MagicMock(), + Dimension=MagicMock(), + ) + monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) + with pytest.raises(ValueError, match="Duplicate"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[ + RubricDimension(id="dup", description="d1", weight=5), + RubricDimension(id="dup", description="d2", weight=3), + ], + ) + + async def 
test_rejects_weight_out_of_range(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + + sdk = fm._ManualRubricSdkTypes( + EvaluatorVersion=MagicMock(), + RubricBasedEvaluatorDefinition=MagicMock(), + Dimension=MagicMock(), + ) + monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) + with pytest.raises(ValueError, match="weight"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[RubricDimension(id="a", description="d", weight=0)], + ) + + async def test_rejects_empty_description(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + + sdk = fm._ManualRubricSdkTypes( + EvaluatorVersion=MagicMock(), + RubricBasedEvaluatorDefinition=MagicMock(), + Dimension=MagicMock(), + ) + monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) + with pytest.raises(ValueError, match="description"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[RubricDimension(id="a", description="", weight=5)], + ) + + +class TestCreateRubricEvaluatorSdkMissing: + async def test_raises_not_implemented_when_sdk_lacks_types(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + + def _raise() -> Any: + raise fm._RubricSdkUnavailableError("nope") + + monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", _raise) + with pytest.raises(NotImplementedError, match="nope"): + await FoundryEvals.create_rubric_evaluator( + project_client=MagicMock(), + name="x", + dimensions=[RubricDimension(id="a", description="d", weight=5)], + ) + + +class TestCreateRubricEvaluatorE2E: + """End-to-end happy path for create_rubric_evaluator with mocked SDK.""" + + async def test_calls_create_version_with_rubric_definition(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import 
_foundry_evals as fm + + dimension_cls = MagicMock(name="Dimension", side_effect=lambda **kw: ("dim", kw)) + definition_cls = MagicMock(name="RubricBasedEvaluatorDefinition", side_effect=lambda **kw: ("def", kw)) + version_cls = MagicMock(name="EvaluatorVersion", side_effect=lambda **kw: ("ver", kw)) + + sdk = fm._ManualRubricSdkTypes( + EvaluatorVersion=version_cls, + RubricBasedEvaluatorDefinition=definition_cls, + Dimension=dimension_cls, + ) + monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) + + created_definition = MagicMock() + created_definition.dimensions = [ + MagicMock(dimension_id="intent", description="d1", weight=9, always_applicable=False), + MagicMock(dimension_id="general_quality", description="g", weight=5, always_applicable=True), + ] + created_definition.pass_threshold = 0.7 + created_version = MagicMock( + display_name="DN", + description="hand-authored", + ) + created_version.name = "policy-eval" + created_version.version = "3" + created_version.definition = created_definition + + evaluators_ops = MagicMock() + evaluators_ops.create_version = AsyncMock(return_value=created_version) + project_client = MagicMock() + project_client.beta = MagicMock(evaluators=evaluators_ops) + + ref = await FoundryEvals.create_rubric_evaluator( + project_client=project_client, + name="policy-eval", + dimensions=[ + RubricDimension(id="intent", description="d1", weight=9), + RubricDimension(id="general_quality", description="g", weight=5, always_applicable=True), + ], + category="quality", + pass_threshold=0.7, + display_name="DN", + description="hand-authored", + tags={"team": "agents"}, + metadata={"source": "manual"}, + ) + + # Returned ref carries the persisted (name, version) and snapshot of dimensions. 
+ assert ref.name == "policy-eval" + assert ref.version == "3" + assert ref.category == "quality" + assert ref.pass_threshold == 0.7 + assert ref.dimensions is not None + assert [d.id for d in ref.dimensions] == ["intent", "general_quality"] + assert ref.dimensions[1].always_applicable is True + + # Dimension construction used dimension_id, included always_applicable only when True. + assert dimension_cls.call_count == 2 + first_kwargs = dimension_cls.call_args_list[0].kwargs + assert first_kwargs == {"dimension_id": "intent", "description": "d1", "weight": 9} + second_kwargs = dimension_cls.call_args_list[1].kwargs + assert second_kwargs == { + "dimension_id": "general_quality", + "description": "g", + "weight": 5, + "always_applicable": True, + } + + # Definition construction forwarded pass_threshold and the two sdk dimensions. + definition_cls.assert_called_once() + def_kwargs = definition_cls.call_args.kwargs + assert def_kwargs["pass_threshold"] == 0.7 + assert def_kwargs["dimensions"] == [ + ("dim", {"dimension_id": "intent", "description": "d1", "weight": 9}), + ( + "dim", + { + "dimension_id": "general_quality", + "description": "g", + "weight": 5, + "always_applicable": True, + }, + ), + ] + + # EvaluatorVersion construction passed evaluator_type="custom", category list, and optionals. + version_cls.assert_called_once() + ver_kwargs = version_cls.call_args.kwargs + assert ver_kwargs["evaluator_type"] == "custom" + assert ver_kwargs["categories"] == ["quality"] + assert ver_kwargs["display_name"] == "DN" + assert ver_kwargs["description"] == "hand-authored" + assert ver_kwargs["tags"] == {"team": "agents"} + assert ver_kwargs["metadata"] == {"source": "manual"} + + # SDK ops invoked with name + evaluator_version kwarg. 
+ evaluators_ops.create_version.assert_awaited_once() + call = evaluators_ops.create_version.await_args + assert call.args == ("policy-eval",) + assert "evaluator_version" in call.kwargs + + async def test_omits_pass_threshold_when_not_set(self, monkeypatch: pytest.MonkeyPatch) -> None: + from agent_framework_foundry import _foundry_evals as fm + + dimension_cls = MagicMock(side_effect=lambda **kw: kw) + definition_cls = MagicMock(side_effect=lambda **kw: kw) + version_cls = MagicMock(side_effect=lambda **kw: kw) + + sdk = fm._ManualRubricSdkTypes( + EvaluatorVersion=version_cls, + RubricBasedEvaluatorDefinition=definition_cls, + Dimension=dimension_cls, + ) + monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) + + created = MagicMock(display_name=None, description=None) + created.name = "x" + created.version = "1" + created.definition = MagicMock(dimensions=[], pass_threshold=None) + + evaluators_ops = MagicMock() + evaluators_ops.create_version = AsyncMock(return_value=created) + project_client = MagicMock() + project_client.beta = MagicMock(evaluators=evaluators_ops) + + ref = await FoundryEvals.create_rubric_evaluator( + project_client=project_client, + name="x", + dimensions=[RubricDimension(id="a", description="d", weight=5)], + ) + assert ref.pass_threshold is None + assert "pass_threshold" not in definition_cls.call_args.kwargs From 484b98d44256583d48f4c47acea1fcfe13e81ce1 Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Thu, 28 May 2026 09:38:02 -0700 Subject: [PATCH 11/16] samples(foundry-evals): manual rubric sample + namespace re-exports Adds evaluate_with_manual_rubric_sample.py demonstrating the end-to-end dev scenario for FoundryEvals.create_rubric_evaluator: hand-author a list of RubricDimension, register via create_rubric_evaluator, then use the pinned GeneratedEvaluatorRef alongside built-in evaluators in an agent regression run. 
Also re-exports RubricDimension, GeneratedEvaluatorRef, build_sources, and load_evals_config from agent_framework.foundry (both the lazy runtime shim and the type stub) so the rubric samples can import everything from a single namespace; the auto-generate sample was previously broken because the shim was missing build_sources / load_evals_config. Updates the foundry-evals README with a chooser entry for the two rubric paths. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/agent_framework/foundry/__init__.py | 4 + .../core/agent_framework/foundry/__init__.pyi | 8 + .../evaluation/foundry_evals/README.md | 22 +++ .../evaluate_with_manual_rubric_sample.py | 172 ++++++++++++++++++ 4 files changed, 206 insertions(+) create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py index 82a476ddff8..2abd2b58e90 100644 --- a/python/packages/core/agent_framework/foundry/__init__.py +++ b/python/packages/core/agent_framework/foundry/__init__.py @@ -34,13 +34,17 @@ "FoundryLocalChatOptions": ("agent_framework_foundry_local", "agent-framework-foundry-local"), "FoundryLocalClient": ("agent_framework_foundry_local", "agent-framework-foundry-local"), "FoundryLocalSettings": ("agent_framework_foundry_local", "agent-framework-foundry-local"), + "GeneratedEvaluatorRef": ("agent_framework_foundry", "agent-framework-foundry"), "RawAnthropicFoundryClient": ("agent_framework_anthropic", "agent-framework-anthropic"), "RawFoundryAgent": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryChatClient": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryEmbeddingClient": ("agent_framework_foundry", "agent-framework-foundry"), + "RubricDimension": 
("agent_framework_foundry", "agent-framework-foundry"), + "build_sources": ("agent_framework_foundry", "agent-framework-foundry"), "evaluate_foundry_target": ("agent_framework_foundry", "agent-framework-foundry"), "evaluate_traces": ("agent_framework_foundry", "agent-framework-foundry"), + "load_evals_config": ("agent_framework_foundry", "agent-framework-foundry"), } diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi index 7deb709c2a3..abcf45868f2 100644 --- a/python/packages/core/agent_framework/foundry/__init__.pyi +++ b/python/packages/core/agent_framework/foundry/__init__.pyi @@ -20,12 +20,16 @@ from agent_framework_foundry import ( FoundryEmbeddingSettings, FoundryEvals, FoundryMemoryProvider, + GeneratedEvaluatorRef, RawFoundryAgent, RawFoundryAgentChatClient, RawFoundryChatClient, RawFoundryEmbeddingClient, + RubricDimension, + build_sources, evaluate_foundry_target, evaluate_traces, + load_evals_config, ) from agent_framework_foundry_local import ( FoundryLocalChatOptions, @@ -51,11 +55,15 @@ __all__ = [ "FoundryLocalClient", "FoundryLocalSettings", "FoundryMemoryProvider", + "GeneratedEvaluatorRef", "RawAnthropicFoundryClient", "RawFoundryAgent", "RawFoundryAgentChatClient", "RawFoundryChatClient", "RawFoundryEmbeddingClient", + "RubricDimension", + "build_sources", "evaluate_foundry_target", "evaluate_traces", + "load_evals_config", ] diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md index 81412a7f0ef..b7f8f7cc1b6 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -35,6 +35,26 @@ Evaluate what already happened — zero changes to agent code: uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py ``` +### `evaluate_with_generated_rubric_sample.py` — Auto-Generate a 
Rubric + +Let Foundry draft the rubric dimensions for you from the agent's +context (instructions, tools, description). Best when you don't yet +have a fixed scoring rubric and want a strong baseline you can refine. + +```bash +uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py +``` + +### `evaluate_with_manual_rubric_sample.py` — Author a Rubric Yourself + +Bring your own `RubricDimension`s (from a spec, a competing framework, +or hand tuning) and register them as a versioned evaluator. Use this +when you already know what you want to score. + +```bash +uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py +``` + ## Setup Create a `.env` file with configuration as in the `.env.example` file in this folder. @@ -44,3 +64,5 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py` - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2 +- **"I want Foundry to draft a custom rubric for my agent"** → `evaluate_with_generated_rubric_sample.py` +- **"I already have a rubric I want to bring into Foundry"** → `evaluate_with_manual_rubric_sample.py` diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py new file mode 100644 index 00000000000..e1fc86ef71c --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py @@ -0,0 +1,172 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Register a hand-authored rubric evaluator and use it in CI. + +This sample demonstrates the *manual* counterpart to +``evaluate_with_generated_rubric_sample.py``: + +1. Build an agent. +2. 
Author the rubric dimensions yourself — useful when you have an + established scoring rubric (from a spec, a competing framework, or + prior hand tuning) that you want to bring into Foundry as-is. +3. Register the rubric with + :meth:`FoundryEvals.create_rubric_evaluator` — this maps directly to + ``project_client.beta.evaluators.create_version`` and returns a + pinned ``GeneratedEvaluatorRef`` you can store in source control. +4. Use the pinned reference in ``evaluators=[...]`` for a regression run + alongside built-in evaluators. + +The service auto-attaches a non-editable residual dimension +(``general_quality`` for ``category="quality"``, +``general_policy_compliance`` for ``"safety"``) — do not include it in +``dimensions``. + +Prefer :meth:`FoundryEvals.generate_rubric` if you want Foundry to +draft the dimensions for you from the agent's context. Use this manual +flow when you already know what you want to score. + +Prerequisites: +- An Azure AI Foundry project with a deployed model. +- ``azure-ai-projects`` build that includes the rubric APIs (currently + ``2.3.0a*`` on the Azure SDK Python dev feed). +- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``. + +Run with: + +.. 
code-block:: bash + + az login + python evaluate_with_manual_rubric_sample.py +""" + +import asyncio +import os + +from agent_framework import evaluate_agent +from agent_framework.foundry import ( + FoundryChatClient, + FoundryEvals, + RubricDimension, +) +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + samples = { + "seattle": "62F, cloudy with a chance of rain", + "london": "55F, overcast", + "paris": "68F, partly sunny", + } + return samples.get(location.lower(), f"Weather data not available for {location}") + + +# Hand-authored rubric — this is the artifact you commit alongside the +# agent so the rubric and the behavior it scores evolve together. +# Weights are 1-10 (the generation pipeline biases one dimension to +# 8-10; manual edits aren't constrained by this heuristic). +TRAVEL_RUBRIC_DIMENSIONS: list[RubricDimension] = [ + RubricDimension( + id="tool_grounding", + description=( + "Grounds every weather claim in tool output. Does not invent values when " + "the tool returns no data, and does not paraphrase tool output in a way " + "that distorts the underlying values." + ), + weight=9, + ), + RubricDimension( + id="scope_adherence", + description=( + "Stays within travel-planning scope. Politely declines or redirects " + "questions about topics unrelated to travel (e.g. general trivia, " + "personal advice, coding questions)." + ), + weight=6, + ), + RubricDimension( + id="actionable_recommendation", + description=( + "Provides a clear, actionable recommendation grounded in the tool result " + "(e.g. 'Pack an umbrella' when rain is reported), not just a restatement " + "of the raw weather data." 
+ ), + weight=4, + ), +] + + +async def main() -> None: + project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] + model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o") + + credential = AzureCliCredential() + chat_client = FoundryChatClient( + project_endpoint=project_endpoint, + model=model_name, + credential=credential, + ) + project_client = AIProjectClient(endpoint=project_endpoint, credential=credential) + + agent = chat_client.as_agent( + name="travel-assistant", + instructions=( + "You are a helpful travel assistant. Always ground recommendations in " + "tool output, cite each tool result, and refuse questions outside travel " + "planning." + ), + tools=[get_weather], + ) + + # 1. Register (or bump the version of) the hand-authored rubric. + # The service auto-attaches the non-editable `general_quality` + # residual dimension for quality rubrics. + print("Registering manual rubric evaluator...") + rubric_ref = await FoundryEvals.create_rubric_evaluator( + project_client=project_client, + name="travel-quality-manual", + dimensions=TRAVEL_RUBRIC_DIMENSIONS, + category="quality", + pass_threshold=0.6, + display_name="Travel Quality (Manual)", + description="Hand-authored rubric for the travel-assistant agent.", + ) + print( + f"Registered rubric {rubric_ref.name}@{rubric_ref.version} " + f"with {len(rubric_ref.dimensions or ())} dimensions " + f"(pass_threshold={rubric_ref.pass_threshold})" + ) + + # 2. Run an evaluation that combines built-ins with the new rubric. + evals = FoundryEvals( + client=chat_client, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref], + ) + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather in Seattle?", + "Should I pack an umbrella for London?", + "What's the capital of France?", # off-scope — exercises scope_adherence + ], + evaluators=evals, + ) + + # 3. Quality gates — wire these into your CI job's exit status. 
+ for r in results: + print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}") + r.assert_no_failed_items() + r.assert_score_at_least(0.7) + r.assert_dimension_score_at_least("tool_grounding", 3) + r.assert_dimension_score_at_least("scope_adherence", 3) + + await project_client.close() + await credential.close() + + +if __name__ == "__main__": + asyncio.run(main()) From 972b55f70c73780a6fe4bc2c5cf4a8be1180bb98 Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Thu, 28 May 2026 10:01:37 -0700 Subject: [PATCH 12/16] feat(foundry-evals): remove rubric creation flows; keep consumption only Reframes agent-framework as a pure consumer of Foundry rubric evaluators: scoring against rubrics that already exist (authored in the Foundry portal or via the dedicated SDK / REST surface) instead of creating them from the SDK. Removed creation surface area: - FoundryEvals.generate_rubric (auto-generate path) and create_rubric_evaluator (manual path), plus all _GenerationSdkTypes / _ManualRubricSdkTypes / _to_sdk_dimensions / _coalesce_generation_sources / _to_sdk_source / _poll_generation_job / _generation_job_to_ref / _evaluator_version_to_ref / _get_beta_evaluators / _import_*_sdk_types helpers. - EvalGenerationSource (the input source discriminator), RubricDimension (the input dimension type), agent_as_eval_source / workflow_as_eval_source / _detect_hosted_foundry_agent helpers, and the YAML-config loader (_evals_config.py with RubricGenerationSpec / RubricSourceSpec / parse_evals_config / load_evals_config / build_sources). - BaseAgent.as_eval_source / Workflow.as_eval_source plus the _render_agent_dossier / _render_workflow_dossier helpers in core. These existed only to feed the now-removed generation pipeline. - Samples evaluate_with_generated_rubric_sample.py, evaluate_with_manual_rubric_sample.py, and evaluators.yaml. 
Replaced with a short README section showing how to reference an existing rubric evaluator via GeneratedEvaluatorRef. Kept (consumption surface): - GeneratedEvaluatorRef, slimmed to (name, version, display_name). Still accepted alongside built-in evaluator strings in FoundryEvals(evaluators=[...]). Versionless refs still warn. - RubricScore on EvalScoreResult.dimensions plus EvalResults.assert_dimension_score_at_least for per-dimension CI gates. - _parse_dimension_entries / _extract_rubric_scores output parsing (both canonical dimension_scores and the legacy rubric_scores key). Tests: 160/160 foundry unit tests and 71/71 core local-eval tests pass; pyright is clean across changed files. The pre-existing tests/core/test_telemetry.py::test_detect_hosted_fallback_import_error failure is unrelated and reproduces on the prior commit. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../packages/core/agent_framework/_agents.py | 43 - .../core/agent_framework/_evaluation.py | 134 --- .../agent_framework/_workflows/_workflow.py | 49 - .../core/agent_framework/foundry/__init__.py | 3 - .../core/agent_framework/foundry/__init__.pyi | 6 - .../core/tests/core/test_local_eval.py | 200 ---- .../agent_framework_foundry/__init__.py | 20 - .../agent_framework_foundry/_evals_config.py | 403 -------- .../agent_framework_foundry/_foundry_evals.py | 869 +----------------- .../foundry/tests/test_evals_config.py | 273 ------ .../foundry/tests/test_foundry_evals.py | 755 +-------------- .../evaluation/foundry_evals/README.md | 43 +- .../evaluate_with_generated_rubric_sample.py | 151 --- .../evaluate_with_manual_rubric_sample.py | 172 ---- .../evaluation/foundry_evals/evaluators.yaml | 11 - 15 files changed, 44 insertions(+), 3088 deletions(-) delete mode 100644 python/packages/foundry/agent_framework_foundry/_evals_config.py delete mode 100644 python/packages/foundry/tests/test_evals_config.py delete mode 100644 
python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py delete mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py delete mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py index 65506cadc6f..585898ae523 100644 --- a/python/packages/core/agent_framework/_agents.py +++ b/python/packages/core/agent_framework/_agents.py @@ -444,49 +444,6 @@ def get_session(self, service_session_id: str, *, session_id: str | None = None) """ return AgentSession(session_id=session_id, service_session_id=service_session_id) - def as_eval_source( - self, - *, - include_instructions: bool = True, - include_tools: bool = True, - include_context_providers: bool = False, - include_examples: bool = False, - examples: Sequence[str] | None = None, - ) -> str: - """Render this agent as a textual dossier for rubric-evaluator generation. - - Packages the agent's name, description, instructions, tool - definitions, and optional context-provider class names into a - single plain-text dossier suitable for passing to a rubric - generation pipeline (e.g. ``FoundryEvals.generate_rubric``). - - Defaults are conservative: instructions and tools are included; - examples and context-provider class names are not. - - Keyword Args: - include_instructions: Whether to include the agent's - instructions text. - include_tools: Whether to include tool definitions. - include_context_providers: Whether to include attached - context-provider class names. - include_examples: Whether to include the supplied ``examples``. - examples: Sample queries / interactions to include when - ``include_examples`` is true. - - Returns: - A plain-text dossier describing the agent. 
- """ - from ._evaluation import _render_agent_dossier # pyright: ignore[reportPrivateUsage] - - return _render_agent_dossier( - self, - include_instructions=include_instructions, - include_tools=include_tools, - include_context_providers=include_context_providers, - include_examples=include_examples, - examples=examples, - ) - async def _run_after_providers( self, *, diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py index b14bdee9b22..52bdf90d0fc 100644 --- a/python/packages/core/agent_framework/_evaluation.py +++ b/python/packages/core/agent_framework/_evaluation.py @@ -673,140 +673,6 @@ class RubricScore: reason: str -# endregion - -# region Eval source rendering - - -def _render_agent_dossier( - agent: Any, - *, - include_instructions: bool, - include_tools: bool, - include_context_providers: bool, - include_examples: bool, - examples: Sequence[str] | None, -) -> str: - """Render a structured, plain-text dossier of an agent for rubric generation.""" - lines: list[str] = [] - name = getattr(agent, "name", None) or "" - description = getattr(agent, "description", None) - lines.append(f"Agent name: {name}") - if description: - lines.append(f"Description: {description}") - - if include_instructions: - instructions: str | None = None - default_options: Any = getattr(agent, "default_options", None) - if isinstance(default_options, dict): - raw_instr: Any = cast("dict[str, Any]", default_options).get("instructions") - if isinstance(raw_instr, str) and raw_instr.strip(): - instructions = raw_instr - if instructions is None: - raw_instr = getattr(agent, "instructions", None) - if isinstance(raw_instr, str) and raw_instr.strip(): - instructions = raw_instr - if instructions: - lines.append("") - lines.append("Instructions:") - lines.append(instructions.strip()) - - if include_tools: - tool_defs = AgentEvalConverter.extract_tools(agent) - if tool_defs: - lines.append("") - lines.append("Tools:") - for 
tool in tool_defs: - tool_line = f"- {tool['name']}" - tool_desc = tool.get("description") - if tool_desc: - tool_line += f": {tool_desc}" - lines.append(tool_line) - params = tool.get("parameters") - if params: - try: - params_json = json.dumps(params, sort_keys=True) - except (TypeError, ValueError): - params_json = str(params) - lines.append(f" parameters: {params_json}") - - if include_context_providers: - providers = getattr(agent, "context_providers", None) - if providers: - lines.append("") - lines.append("Context providers:") - for provider in providers: - lines.append(f"- {type(provider).__name__}") - - if include_examples and examples: - lines.append("") - lines.append("Examples:") - for idx, example in enumerate(examples, start=1): - lines.append(f"{idx}. {example}") - - return "\n".join(lines).strip() - - -def _render_workflow_dossier( # pyright: ignore[reportUnusedFunction] - workflow: Workflow, - *, - include_instructions: bool, - include_tools: bool, - include_context_providers: bool, - include_examples: bool, - examples: Sequence[str] | None, - include_topology: bool, -) -> str: - """Render a structured, plain-text dossier of a workflow for rubric generation.""" - from ._workflows._agent_executor import AgentExecutor as _AE - - lines: list[str] = [] - name = workflow.name or "" - lines.append(f"Workflow name: {name}") - if workflow.description: - lines.append(f"Description: {workflow.description}") - - if include_topology: - try: - topology = json.dumps(workflow.to_dict(), sort_keys=True, default=str) - except (TypeError, ValueError) as exc: - logger.debug("Workflow.to_dict() failed during eval source export: %s", exc) - topology = None - if topology: - lines.append("") - lines.append("Topology (JSON):") - lines.append(topology) - - agent_executors: list[tuple[str, Any]] = [] - for executor_id, executor in workflow.executors.items(): - if isinstance(executor, _AE): - agent_executors.append((executor_id, executor.agent)) - - if agent_executors: - 
lines.append("") - lines.append("Agents:") - for executor_id, agent in agent_executors: - lines.append("") - lines.append(f"Executor: {executor_id}") - dossier = _render_agent_dossier( - agent, - include_instructions=include_instructions, - include_tools=include_tools, - include_context_providers=include_context_providers, - include_examples=False, - examples=None, - ) - lines.append(dossier) - - if include_examples and examples: - lines.append("") - lines.append("Examples:") - for idx, example in enumerate(examples, start=1): - lines.append(f"{idx}. {example}") - - return "\n".join(lines).strip() - - # endregion # region Evaluator protocol diff --git a/python/packages/core/agent_framework/_workflows/_workflow.py b/python/packages/core/agent_framework/_workflows/_workflow.py index bce7569ef1a..0493cd015f3 100644 --- a/python/packages/core/agent_framework/_workflows/_workflow.py +++ b/python/packages/core/agent_framework/_workflows/_workflow.py @@ -410,55 +410,6 @@ def to_json(self) -> str: """Serialize the workflow definition to JSON.""" return json.dumps(self.to_dict()) - def as_eval_source( - self, - *, - include_instructions: bool = True, - include_tools: bool = True, - include_context_providers: bool = False, - include_examples: bool = False, - examples: Sequence[str] | None = None, - include_topology: bool = True, - ) -> str: - """Render this workflow as a textual dossier for rubric-evaluator generation. - - Produces a plain-text dossier containing the workflow's name, - description, optional JSON-encoded topology (from - :meth:`Workflow.to_dict`), and per-agent dossiers extracted from - ``AgentExecutor`` nodes. Suitable for passing to a rubric - generation pipeline (e.g. ``FoundryEvals.generate_rubric``). - - Defaults are conservative: per-agent instructions and tools are - included, plus the JSON-encoded topology. Examples and - context-provider class names are excluded by default. - - Keyword Args: - include_instructions: Per-agent instructions inclusion. 
- include_tools: Per-agent tool-definition inclusion. - include_context_providers: Per-agent context-provider - inclusion. - include_examples: Whether to include workflow-level - ``examples``. - examples: Sample queries / interactions to include when - ``include_examples`` is true. - include_topology: Whether to embed the JSON-encoded workflow - topology in the rendered dossier. - - Returns: - A plain-text dossier describing the workflow. - """ - from .._evaluation import _render_workflow_dossier # pyright: ignore[reportPrivateUsage] - - return _render_workflow_dossier( - self, - include_instructions=include_instructions, - include_tools=include_tools, - include_context_providers=include_context_providers, - include_examples=include_examples, - examples=examples, - include_topology=include_topology, - ) - def get_start_executor(self) -> Executor: """Get the starting executor of the workflow. diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py index 2abd2b58e90..4fe624cc169 100644 --- a/python/packages/core/agent_framework/foundry/__init__.py +++ b/python/packages/core/agent_framework/foundry/__init__.py @@ -40,11 +40,8 @@ "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryChatClient": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryEmbeddingClient": ("agent_framework_foundry", "agent-framework-foundry"), - "RubricDimension": ("agent_framework_foundry", "agent-framework-foundry"), - "build_sources": ("agent_framework_foundry", "agent-framework-foundry"), "evaluate_foundry_target": ("agent_framework_foundry", "agent-framework-foundry"), "evaluate_traces": ("agent_framework_foundry", "agent-framework-foundry"), - "load_evals_config": ("agent_framework_foundry", "agent-framework-foundry"), } diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi index 
abcf45868f2..145ea48087d 100644 --- a/python/packages/core/agent_framework/foundry/__init__.pyi +++ b/python/packages/core/agent_framework/foundry/__init__.pyi @@ -25,11 +25,8 @@ from agent_framework_foundry import ( RawFoundryAgentChatClient, RawFoundryChatClient, RawFoundryEmbeddingClient, - RubricDimension, - build_sources, evaluate_foundry_target, evaluate_traces, - load_evals_config, ) from agent_framework_foundry_local import ( FoundryLocalChatOptions, @@ -61,9 +58,6 @@ __all__ = [ "RawFoundryAgentChatClient", "RawFoundryChatClient", "RawFoundryEmbeddingClient", - "RubricDimension", - "build_sources", "evaluate_foundry_target", "evaluate_traces", - "load_evals_config", ] diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py index e4c37dfb4b4..e60fb35d514 100644 --- a/python/packages/core/tests/core/test_local_eval.py +++ b/python/packages/core/tests/core/test_local_eval.py @@ -5,7 +5,6 @@ from __future__ import annotations import inspect -from typing import Any import pytest @@ -1114,202 +1113,3 @@ def test_evaluator_filter_isolates_offenders(self) -> None: ) # The low-scoring "other" evaluator is filtered out; "policy" passes. results.assert_dimension_score_at_least("clarity", 3, evaluator="policy") - - -# --------------------------------------------------------------------------- -# Eval source rendering (string dossiers) -# --------------------------------------------------------------------------- - - -class TestAgentAsEvalSource: - """Tests for BaseAgent.as_eval_source / _render_agent_dossier.""" - - def _make_mock_agent( - self, - *, - name: str = "weather-bot", - description: str | None = "Looks up the weather.", - instructions: str | None = "Be concise. 
Always cite the source.", - tools: list[Any] | None = None, - context_providers: list[Any] | None = None, - mcp_tools: list[Any] | None = None, - ) -> Any: - from unittest.mock import MagicMock - - from agent_framework._tools import ai_function - - agent = MagicMock() - agent.name = name - agent.description = description - agent.default_options = {"instructions": instructions, "tools": tools or []} - agent.context_providers = context_providers or [] - agent.mcp_tools = mcp_tools or [] - if tools: - normalized: list[Any] = [] - for t in tools: - if callable(t) and not hasattr(t, "parameters"): - normalized.append(ai_function(t)) - else: - normalized.append(t) - agent.default_options["tools"] = normalized - return agent - - def _render(self, agent: Any, **overrides: Any) -> str: - from agent_framework._evaluation import _render_agent_dossier - - kwargs: dict[str, Any] = { - "include_instructions": True, - "include_tools": True, - "include_context_providers": False, - "include_examples": False, - "examples": None, - } - kwargs.update(overrides) - return _render_agent_dossier(agent, **kwargs) - - def test_basic_dossier_includes_name_and_instructions(self): - agent = self._make_mock_agent() - dossier = self._render(agent) - assert isinstance(dossier, str) - assert "Agent name: weather-bot" in dossier - assert "Description: Looks up the weather." in dossier - assert "Instructions:" in dossier - assert "Be concise." 
in dossier - - def test_tools_section_includes_definitions(self): - def get_weather(city: str) -> str: - """Return the current weather for *city*.""" - return f"sunny in {city}" - - agent = self._make_mock_agent(tools=[get_weather]) - dossier = self._render(agent) - assert "Tools:" in dossier - assert "- get_weather" in dossier - assert '"city"' in dossier - - def test_include_instructions_false_omits_section(self): - agent = self._make_mock_agent() - dossier = self._render(agent, include_instructions=False) - assert "Instructions:" not in dossier - - def test_include_tools_false_omits_section(self): - def get_weather(city: str) -> str: - return f"sunny in {city}" - - agent = self._make_mock_agent(tools=[get_weather]) - dossier = self._render(agent, include_tools=False) - assert "Tools:" not in dossier - - def test_context_providers_excluded_by_default_but_included_when_opted_in(self): - class StubProvider: - pass - - agent = self._make_mock_agent(context_providers=[StubProvider()]) - default_dossier = self._render(agent) - assert "Context providers:" not in default_dossier - - opt_in_dossier = self._render(agent, include_context_providers=True) - assert "Context providers:" in opt_in_dossier - assert "- StubProvider" in opt_in_dossier - - def test_examples_excluded_by_default_but_included_when_opted_in(self): - agent = self._make_mock_agent() - default_dossier = self._render(agent, examples=["What's the weather in NYC?"]) - assert "Examples:" not in default_dossier - - opt_in_dossier = self._render( - agent, - include_examples=True, - examples=["What's the weather in NYC?"], - ) - assert "Examples:" in opt_in_dossier - assert "What's the weather in NYC?" 
in opt_in_dossier - - def test_base_agent_method_returns_dossier_string(self): - from agent_framework._agents import BaseAgent - - class _ConcreteAgent(BaseAgent): - pass - - agent = _ConcreteAgent(name="test-agent", description="A test agent.") - dossier = agent.as_eval_source() - assert isinstance(dossier, str) - assert "Agent name: test-agent" in dossier - - -class TestWorkflowAsEvalSource: - """Tests for Workflow.as_eval_source / _render_workflow_dossier.""" - - def _build_workflow(self, *, with_agent: bool = False) -> Any: - from unittest.mock import MagicMock - - from agent_framework._workflows._agent_executor import AgentExecutor - - workflow = MagicMock() - workflow.name = "demo-workflow" - workflow.description = "Routes user questions through a single agent." - workflow.to_dict.return_value = { - "name": "demo-workflow", - "id": "wf_1", - "start_executor_id": "agent_1", - "edge_groups": [], - "executors": {"agent_1": {"type": "AgentExecutor"}}, - } - - if with_agent: - inner_agent = MagicMock() - inner_agent.name = "inner-agent" - inner_agent.description = "Inner agent." 
- inner_agent.default_options = {"instructions": "Answer politely.", "tools": []} - inner_agent.context_providers = [] - inner_agent.mcp_tools = [] - - executor = MagicMock(spec=AgentExecutor) - executor.agent = inner_agent - workflow.executors = {"agent_1": executor} - else: - workflow.executors = {} - return workflow - - def _render(self, workflow: Any, **overrides: Any) -> str: - from agent_framework._evaluation import _render_workflow_dossier - - kwargs: dict[str, Any] = { - "include_instructions": True, - "include_tools": True, - "include_context_providers": False, - "include_examples": False, - "examples": None, - "include_topology": True, - } - kwargs.update(overrides) - return _render_workflow_dossier(workflow, **kwargs) - - def test_emits_dossier_with_topology(self): - workflow = self._build_workflow() - dossier = self._render(workflow) - assert isinstance(dossier, str) - assert "Workflow name: demo-workflow" in dossier - assert "Topology (JSON):" in dossier - assert '"start_executor_id": "agent_1"' in dossier - - def test_topology_can_be_disabled(self): - workflow = self._build_workflow() - dossier = self._render(workflow, include_topology=False) - assert "Topology (JSON):" not in dossier - - def test_per_agent_dossiers_included_when_executor_is_agent_executor(self): - workflow = self._build_workflow(with_agent=True) - dossier = self._render(workflow) - assert "Agents:" in dossier - assert "Executor: agent_1" in dossier - assert "Agent name: inner-agent" in dossier - assert "Answer politely." 
in dossier - - def test_workflow_examples_excluded_by_default(self): - workflow = self._build_workflow() - default_dossier = self._render(workflow, examples=["Hi"]) - assert "Examples:" not in default_dossier - - opt_in_dossier = self._render(workflow, examples=["Hi"], include_examples=True) - assert "Examples:" in opt_in_dossier diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index efbe0b8d248..1e40fbc68f6 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -10,22 +10,11 @@ FoundryEmbeddingSettings, RawFoundryEmbeddingClient, ) -from ._evals_config import ( - RubricGenerationSpec, - RubricSourceSpec, - build_sources, - load_evals_config, - parse_evals_config, -) from ._foundry_evals import ( - EvalGenerationSource, FoundryEvals, GeneratedEvaluatorRef, - RubricDimension, - agent_as_eval_source, evaluate_foundry_target, evaluate_traces, - workflow_as_eval_source, ) from ._memory_provider import FoundryMemoryProvider @@ -35,7 +24,6 @@ __version__ = "0.0.0" __all__ = [ - "EvalGenerationSource", "FoundryAgent", "FoundryAgentOptions", "FoundryChatClient", @@ -50,15 +38,7 @@ "RawFoundryAgentChatClient", "RawFoundryChatClient", "RawFoundryEmbeddingClient", - "RubricDimension", - "RubricGenerationSpec", - "RubricSourceSpec", "__version__", - "agent_as_eval_source", - "build_sources", "evaluate_foundry_target", "evaluate_traces", - "load_evals_config", - "parse_evals_config", - "workflow_as_eval_source", ] diff --git a/python/packages/foundry/agent_framework_foundry/_evals_config.py b/python/packages/foundry/agent_framework_foundry/_evals_config.py deleted file mode 100644 index 5f45e2854b8..00000000000 --- a/python/packages/foundry/agent_framework_foundry/_evals_config.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -"""YAML-driven evaluator configuration for rubric generation and evaluation. - -Defines the source-controlled config schema described in -``adaptive-evals-draft.md``: a list of named rubric-generation specs that -CI jobs and harnesses parse to drive -:meth:`FoundryEvals.generate_rubric`. - -Example config: - -.. code-block:: yaml - - evaluators: - reservation-agent-quality: - type: foundry.generated_rubric - category: quality - model: gpt-4o - agent: reservation-agent - sources: - - type: agent - include_instructions: true - include_tools: true - - type: dataset - name: reservation-business-rules - version: "1" - -Example loader usage: - -.. code-block:: python - - from agent_framework_foundry import load_evals_config, FoundryEvals - - config = load_evals_config("evaluators.yaml") - spec = config["reservation-agent-quality"] - sources = build_sources(spec, agent=agent) - ref = await FoundryEvals.generate_rubric( - project_client=client, - name=spec.name, - sources=sources, - category=spec.category, - model=spec.model, - display_name=spec.display_name, - description=spec.description, - ) -""" - -from __future__ import annotations - -import os -from collections.abc import Mapping -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Literal, cast - -from agent_framework._feature_stage import ExperimentalFeature, experimental - -from ._foundry_evals import ( - EvalGenerationSource, - agent_as_eval_source, - workflow_as_eval_source, -) - -_RUBRIC_TYPE = "foundry.generated_rubric" - - -@experimental(feature_id=ExperimentalFeature.EVALS) -@dataclass(frozen=True) -class RubricSourceSpec: - """A single source entry in a :class:`RubricGenerationSpec` ``sources`` list. - - Mirrors the per-source YAML schema. The :attr:`type` field is the - discriminator; only the fields relevant to each type are read. - - Attributes: - type: One of ``"agent"``, ``"workflow"``, ``"prompt"``, - ``"dataset"``, ``"traces"``. 
- description: Optional description shown in Foundry UI. - include_instructions: Whether to include the bound agent / - workflow's instructions. Applies to ``"agent"`` and - ``"workflow"`` types. - include_tools: Whether to include the bound agent / workflow's - tools. Applies to ``"agent"`` and ``"workflow"`` types. - include_context_providers: Whether to include attached - context-provider class names. Applies to ``"agent"`` and - ``"workflow"`` types. - include_examples: Whether to include ``examples``. Applies to - ``"agent"`` and ``"workflow"`` types. - include_topology: Whether to include the JSON-encoded topology. - Applies to ``"workflow"`` type. - examples: Optional list of example queries for ``"agent"`` / - ``"workflow"`` sources. - prompt: Rendered dossier for ``"prompt"`` type. - agent_name: Hosted Foundry agent name for ``"agent"`` type with - a server-side reference. - name: Dataset name for ``"dataset"`` type. - version: Pinned dataset version. - metadata: Free-form metadata for ``"traces"`` sources. - """ - - type: Literal["agent", "workflow", "prompt", "dataset", "traces"] - description: str | None = None - include_instructions: bool = True - include_tools: bool = True - include_context_providers: bool = False - include_examples: bool = False - include_topology: bool = True - examples: tuple[str, ...] = field(default_factory=tuple) - prompt: str | None = None - agent_name: str | None = None - name: str | None = None - version: str | None = None - metadata: dict[str, Any] | None = None - - -@experimental(feature_id=ExperimentalFeature.EVALS) -@dataclass(frozen=True) -class RubricGenerationSpec: - """A single named entry from an evaluators YAML config. - - Attributes: - name: Evaluator name (the YAML key under ``evaluators``). - type: Discriminator literal. Must be - ``"foundry.generated_rubric"`` for rubric evaluators. - category: ``"quality"`` or ``"safety"``. - model: Optional model deployment to drive generation. 
- agent: Optional symbolic reference to the agent in the - caller's harness. Resolved by user code into a - :class:`BaseAgent` and passed to - :func:`build_sources`. - workflow: Optional symbolic reference to a workflow. - display_name: Optional human-readable name. - description: Optional description. - sources: List of source specs to feed into generation. When - empty, callers typically default to a single - ``RubricSourceSpec(type='agent')`` or - ``RubricSourceSpec(type='workflow')`` source. - """ - - name: str - type: str = _RUBRIC_TYPE - category: Literal["quality", "safety"] = "quality" - model: str | None = None - agent: str | None = None - workflow: str | None = None - display_name: str | None = None - description: str | None = None - sources: tuple[RubricSourceSpec, ...] = field(default_factory=tuple) - - -@experimental(feature_id=ExperimentalFeature.EVALS) -def load_evals_config(path: str | os.PathLike[str]) -> dict[str, RubricGenerationSpec]: - """Load a YAML evaluators config and return a name -> spec mapping. - - Reads ``path`` (UTF-8) and parses the top-level ``evaluators`` - mapping into :class:`RubricGenerationSpec` instances keyed by name. - - Requires ``PyYAML``. Raises :class:`ImportError` with a helpful - message when PyYAML is not installed. - - Args: - path: Filesystem path to the YAML config. - - Returns: - A dict mapping evaluator name to :class:`RubricGenerationSpec`. - - Raises: - ImportError: If PyYAML is not installed. - ValueError: If the YAML file is malformed. - """ - try: - import yaml # type: ignore[import-untyped] - except ImportError as exc: - raise ImportError("load_evals_config requires PyYAML. 
Install with `pip install pyyaml`.") from exc - - raw = yaml.safe_load(Path(path).read_text(encoding="utf-8")) - return parse_evals_config(raw) - - -@experimental(feature_id=ExperimentalFeature.EVALS) -def parse_evals_config(data: Any) -> dict[str, RubricGenerationSpec]: - """Parse an already-loaded YAML mapping into rubric-generation specs. - - Useful when callers manage YAML loading themselves (e.g. CI that - interpolates env vars before parsing). - - Args: - data: A mapping with an ``"evaluators"`` key containing a mapping - of evaluator names to spec dicts. - - Returns: - A dict mapping evaluator name to :class:`RubricGenerationSpec`. - - Raises: - ValueError: If the structure is malformed. - """ - if not isinstance(data, Mapping): - raise ValueError("Evaluators config must be a mapping.") - data_map = cast("Mapping[str, Any]", data) - raw_evaluators = data_map.get("evaluators") - if raw_evaluators is None: - raise ValueError("Evaluators config is missing a top-level 'evaluators' key.") - if not isinstance(raw_evaluators, Mapping): - raise ValueError("Evaluators config 'evaluators' entry must be a mapping.") - evaluators = cast("Mapping[str, Any]", raw_evaluators) - - parsed: dict[str, RubricGenerationSpec] = {} - for name, raw in evaluators.items(): - if not isinstance(raw, Mapping): - raise ValueError(f"Evaluator entry {name!r} must be a mapping, got {type(raw).__name__}.") - raw_map = cast("Mapping[str, Any]", raw) - parsed[name] = _parse_spec(name, raw_map) - return parsed - - -def _parse_spec(name: str, raw: Mapping[str, Any]) -> RubricGenerationSpec: - type_value = raw.get("type", _RUBRIC_TYPE) - if type_value != _RUBRIC_TYPE: - raise ValueError(f"Evaluator {name!r} has unsupported type {type_value!r}; expected {_RUBRIC_TYPE!r}.") - category = raw.get("category", "quality") - if category not in ("quality", "safety"): - raise ValueError(f"Evaluator {name!r} has invalid category {category!r}; expected 'quality' or 'safety'.") - - raw_sources_obj: Any = 
raw.get("sources") or () - if not isinstance(raw_sources_obj, (list, tuple)): - raise ValueError(f"Evaluator {name!r} 'sources' must be a list.") - sources_iter: list[Any] = list(cast("Any", raw_sources_obj)) - sources: list[RubricSourceSpec] = [] - for index, raw_source in enumerate(sources_iter): - if not isinstance(raw_source, Mapping): - raise ValueError( - f"Evaluator {name!r} source entry {index} must be a mapping, got {type(raw_source).__name__}." - ) - sources.append(_parse_source(name, index, cast("Mapping[str, Any]", raw_source))) - - return RubricGenerationSpec( - name=name, - type=type_value, - category=category, - model=raw.get("model"), - agent=raw.get("agent"), - workflow=raw.get("workflow"), - display_name=raw.get("display_name"), - description=raw.get("description"), - sources=tuple(sources), - ) - - -def _parse_source(spec_name: str, index: int, raw: Mapping[str, Any]) -> RubricSourceSpec: - type_value = raw.get("type") - if type_value not in ("agent", "workflow", "prompt", "dataset", "traces"): - raise ValueError( - f"Evaluator {spec_name!r} source {index} has invalid type {type_value!r}; " - "expected one of 'agent', 'workflow', 'prompt', 'dataset', 'traces'." 
- ) - - examples_raw: Any = raw.get("examples") or () - if not isinstance(examples_raw, (list, tuple)): - raise ValueError(f"Evaluator {spec_name!r} source {index} 'examples' must be a list.") - examples_iter: list[Any] = list(cast("Any", examples_raw)) - examples = tuple(str(e) for e in examples_iter) - - metadata_raw = raw.get("metadata") - if metadata_raw is not None and not isinstance(metadata_raw, Mapping): - raise ValueError(f"Evaluator {spec_name!r} source {index} 'metadata' must be a mapping.") - - return RubricSourceSpec( - type=cast("Any", type_value), - description=raw.get("description"), - include_instructions=bool(raw.get("include_instructions", True)), - include_tools=bool(raw.get("include_tools", True)), - include_context_providers=bool(raw.get("include_context_providers", False)), - include_examples=bool(raw.get("include_examples", False)), - include_topology=bool(raw.get("include_topology", True)), - examples=examples, - prompt=raw.get("prompt"), - agent_name=raw.get("agent_name"), - name=raw.get("name"), - version=str(raw.get("version")) if raw.get("version") is not None else None, - metadata=dict(cast("Mapping[str, Any]", metadata_raw)) if metadata_raw is not None else None, - ) - - -@experimental(feature_id=ExperimentalFeature.EVALS) -def build_sources( - spec: RubricGenerationSpec, - *, - agent: Any | None = None, - workflow: Any | None = None, -) -> list[EvalGenerationSource]: - """Translate a spec's source list into :class:`EvalGenerationSource` instances. - - Resolves each :class:`RubricSourceSpec` against the supplied - ``agent`` and ``workflow`` instances: - - * ``type='agent'`` sources call :func:`agent_as_eval_source` with - the spec's include-flags. If the source carries an - ``agent_name`` the agent is referenced server-side instead. - * ``type='workflow'`` sources call - :func:`workflow_as_eval_source` with the spec's include-flags. 
- * ``type='prompt'``, ``type='dataset'``, and ``type='traces'`` - sources are translated directly into - :class:`EvalGenerationSource` instances without consulting the - runtime agent or workflow. - - When the spec has no ``sources`` entries, defaults to a single - ``type='agent'`` source when an ``agent`` is provided, or a single - ``type='workflow'`` source when a ``workflow`` is provided. - - Args: - spec: Parsed :class:`RubricGenerationSpec`. - agent: Optional agent instance for ``type='agent'`` sources. - workflow: Optional workflow instance for ``type='workflow'`` - sources. - - Returns: - A list of :class:`EvalGenerationSource` instances ready to pass - to :meth:`FoundryEvals.generate_rubric` as ``sources=``. - - Raises: - ValueError: If a source references an agent or workflow that - was not supplied. - """ - if not spec.sources: - if agent is not None: - return [agent_as_eval_source(agent)] - if workflow is not None: - return [workflow_as_eval_source(workflow)] - raise ValueError(f"Spec {spec.name!r} has no sources and no agent/workflow was provided to build_sources().") - - out: list[EvalGenerationSource] = [] - for src in spec.sources: - if src.type == "agent": - if src.agent_name: - out.append( - EvalGenerationSource( - type="agent", - agent_name=src.agent_name, - description=src.description, - ) - ) - continue - if agent is None: - raise ValueError(f"Spec {spec.name!r} has a source of type 'agent' but no agent= was provided.") - out.append( - agent_as_eval_source( - agent, - include_instructions=src.include_instructions, - include_tools=src.include_tools, - include_context_providers=src.include_context_providers, - include_examples=src.include_examples, - examples=list(src.examples) if src.examples else None, - ) - ) - elif src.type == "workflow": - if workflow is None: - raise ValueError(f"Spec {spec.name!r} has a source of type 'workflow' but no workflow= was provided.") - out.append( - workflow_as_eval_source( - workflow, - 
include_instructions=src.include_instructions, - include_tools=src.include_tools, - include_context_providers=src.include_context_providers, - include_examples=src.include_examples, - examples=list(src.examples) if src.examples else None, - include_topology=src.include_topology, - ) - ) - elif src.type == "prompt": - if not src.prompt: - raise ValueError(f"Spec {spec.name!r} has a 'prompt' source missing the 'prompt' field.") - out.append(EvalGenerationSource(type="prompt", prompt=src.prompt, description=src.description)) - elif src.type == "dataset": - if not src.name: - raise ValueError(f"Spec {spec.name!r} has a 'dataset' source missing the 'name' field.") - out.append( - EvalGenerationSource( - type="dataset", - dataset_name=src.name, - dataset_version=src.version, - description=src.description, - ) - ) - elif src.type == "traces": - out.append( - EvalGenerationSource( - type="traces", - description=src.description, - metadata=src.metadata, - ) - ) - else: # pragma: no cover - guarded by _parse_source - raise ValueError(f"Spec {spec.name!r} has unknown source type {src.type!r}.") - return out - - -__all__ = [ - "RubricGenerationSpec", - "RubricSourceSpec", - "build_sources", - "load_evals_config", - "parse_evals_config", -] diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index 7fddd64c38c..f242db06d91 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -30,7 +30,7 @@ import logging from collections.abc import Iterable, Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, cast from agent_framework._evaluation import ( AgentEvalConverter, @@ -48,332 +48,56 @@ from ._chat_client import FoundryChatClient if TYPE_CHECKING: - from agent_framework._agents import BaseAgent - from 
agent_framework._workflows._workflow import Workflow from azure.ai.projects.aio import AIProjectClient from openai.types.evals import RunRetrieveResponse logger = logging.getLogger(__name__) -# region Generated rubric evaluator types - - -@experimental(feature_id=ExperimentalFeature.EVALS) -@dataclass(frozen=True) -class RubricDimension: - """A single dimension of a generated rubric evaluator. - - Rubric evaluators score each item along one or more named dimensions, - each with its own description and weight. Foundry's evaluator - generation pipeline produces these dimensions from agent/workflow - metadata; ``RubricDimension`` surfaces them so callers can inspect a - generated evaluator's structure without round-tripping through the - portal. - - Attributes: - id: Stable identifier for the dimension (e.g. ``"policy_enforcement"``). - description: Natural-language description of what the dimension scores. - weight: Integer weight controlling the dimension's contribution to - the aggregate score. - always_applicable: When ``False``, evaluators may mark this - dimension non-applicable on a per-item basis. - """ - - id: str - description: str - weight: int - always_applicable: bool = False +# region Generated rubric evaluator references @experimental(feature_id=ExperimentalFeature.EVALS) @dataclass(frozen=True) class GeneratedEvaluatorRef: - """A reference to a generated rubric evaluator stored in Foundry. + """A reference to a rubric evaluator that already exists in Foundry. Pass instances of this class to :class:`FoundryEvals` to score items - with a previously generated rubric evaluator. Construct directly - when the evaluator already exists, or obtain one from - :meth:`FoundryEvals.generate_rubric`. + with a pre-existing rubric evaluator (manually authored or + auto-generated through the Foundry portal). agent-framework is a + consumer here: it does not create or modify the evaluator definition; + it only references the persisted version by name. 
Pinning ``version`` is strongly recommended so evaluation runs are - reproducible. The dataclass accepts ``version=None`` for the - convenience of :meth:`latest`, but ``FoundryEvals`` emits a warning - whenever a versionless reference is used; CI gates should always - pass a concrete version. + reproducible. ``version=None`` resolves to whichever version is + current at execution time; :class:`FoundryEvals` emits a warning when + a versionless reference is used. CI gates should always pass a + concrete version. Attributes: - name: Evaluator name as stored in the Foundry project (e.g. - ``"my-policy-evaluator"``). Distinct from built-in - evaluators such as ``"builtin.relevance"``. + name: Evaluator name as stored in the Foundry project (for + example ``"reservation-policy-rubric"``). Distinct from + built-in evaluators such as ``"builtin.relevance"``. version: Pinned evaluator version. ``None`` means "latest" — - this is discouraged for CI/repro and ``FoundryEvals`` will - emit a warning when used. - category: ``"quality"`` for ungrounded rubric scoring, - ``"safety"`` for safety-focused evaluators. Matches the - Foundry evaluator's declared category. + this is discouraged for CI/repro and :class:`FoundryEvals` + will emit a warning when used. display_name: Optional human-readable name used in result summaries. Defaults to ``name`` when unset. - description: Optional description carried over from the - generated evaluator definition for documentation. - dimensions: Optional snapshot of the rubric's dimensions for - inspection. Not required to invoke the evaluator — the - service uses the persisted definition. - pass_threshold: Optional aggregate score threshold (0.0-1.0) the - evaluator considers a passing item. ``None`` defers to the - evaluator's stored default. """ name: str version: str | None = None - category: Literal["quality", "safety"] = "quality" display_name: str | None = None - description: str | None = None - dimensions: tuple[RubricDimension, ...] 
| None = None - pass_threshold: float | None = None @classmethod - def latest( - cls, - name: str, - *, - category: Literal["quality", "safety"] = "quality", - display_name: str | None = None, - description: str | None = None, - ) -> GeneratedEvaluatorRef: + def latest(cls, name: str, *, display_name: str | None = None) -> GeneratedEvaluatorRef: """Construct a versionless reference (resolves to the latest version at run time). Discouraged for reproducible runs. Prefer the constructor with an explicit ``version`` so CI and replay evaluations stay stable - when the evaluator is regenerated. + when the evaluator is updated in Foundry. """ - return cls( - name=name, - version=None, - category=category, - display_name=display_name, - description=description, - ) - - -@experimental(feature_id=ExperimentalFeature.EVALS) -@dataclass(frozen=True) -class EvalGenerationSource: - """A source description passed to Foundry's evaluator generation pipeline. - - Rubric evaluator generation consumes one or more sources that describe - the agent or workflow under evaluation. ``FoundryEvals`` translates - instances into the underlying ``*EvaluatorGenerationJobSource`` SDK - types. - - Discriminated by :attr:`type`: - - * ``"prompt"`` - a free-form textual dossier (typical for local agents - and workflows whose tools cannot be fetched server-side). - * ``"agent"`` - a hosted Foundry agent referenced by name so the - service fetches tool definitions and metadata directly. - * ``"dataset"`` - a Foundry dataset of recorded interactions. - * ``"traces"`` - tracing data scoped by metadata. - - Only the fields relevant to :attr:`type` are populated; the remaining - fields stay ``None``. - - Attributes: - type: Source kind. See discriminator above. - description: Optional short description shown in Foundry UI. - prompt: Rendered dossier for ``type="prompt"`` sources. - agent_name: Hosted Foundry agent name for ``type="agent"`` sources. 
- agent_version: Optional pinned hosted-agent version for - ``type="agent"`` sources. ``None`` resolves to the latest - version at generation time; pin for reproducible runs. - dataset_name: Foundry dataset name for ``type="dataset"`` sources. - dataset_version: Pinned dataset version (recommended for repro). - metadata: Free-form metadata. Used by ``type="traces"`` sources - for tracing-attribute filters and as a generic escape hatch - for additional fields not yet modeled. - """ - - type: Literal["prompt", "dataset", "agent", "traces"] - description: str | None = None - prompt: str | None = None - agent_name: str | None = None - agent_version: str | None = None - dataset_name: str | None = None - dataset_version: str | None = None - metadata: dict[str, Any] | None = None - - -@experimental(feature_id=ExperimentalFeature.EVALS) -def agent_as_eval_source( - agent: BaseAgent, - *, - include_instructions: bool = True, - include_tools: bool = True, - include_context_providers: bool = False, - include_examples: bool = False, - examples: Sequence[str] | None = None, - hosted_agent_name: str | None = None, - hosted_agent_version: str | None = None, - force_prompt_source: bool = False, -) -> EvalGenerationSource: - """Render an agent as an :class:`EvalGenerationSource` for rubric generation. - - Picks the best Foundry source variant for the supplied agent: - - * **Hosted Foundry agents** (``FoundryAgent`` connected to a Prompt - Agent or Hosted Agent in a Foundry project) are emitted as - ``type="agent"`` sources keyed by ``agent_name`` so the service - fetches instructions, tools, and metadata directly from the agent - registry — independent of whatever the local wrapper happens to - hold. Detected automatically from ``agent.chat_client.agent_name`` - and ``agent.chat_client.agent_version``. - * **Local agents** (any other ``BaseAgent`` whose instructions and - tools live client-side, e.g. 
``FoundryChatClient``-backed agents or - pure OpenAI Responses agents) are emitted as ``type="prompt"`` - sources with a rendered text dossier. - - Override the heuristic by passing ``hosted_agent_name`` explicitly - (forces an ``"agent"`` source) or ``force_prompt_source=True`` - (forces a ``"prompt"`` source — useful when you want the service to - score a hosted agent against the *local* wrapper's overrides). - - Args: - agent: Agent instance (typically a ``BaseAgent`` subclass). - include_instructions: Whether to include the agent's instructions - text in the dossier (``"prompt"`` sources only). Defaults to - ``True``. - include_tools: Whether to include tool definitions in the dossier - (``"prompt"`` sources only). Defaults to ``True``. - include_context_providers: Whether to include the names of - attached context-provider classes in the dossier - (``"prompt"`` sources only). Defaults to ``False`` to avoid - leaking implementation details. - include_examples: Whether to include the supplied ``examples`` in - the dossier (``"prompt"`` sources only). Defaults to - ``False`` to avoid shipping potentially sensitive sample - inputs by default. - examples: Optional sample queries / interactions to include when - ``include_examples`` is ``True``. - hosted_agent_name: When set, emit a ``type="agent"`` source - referencing this hosted Foundry agent name regardless of - auto-detection. Use to override or supplement the - heuristic. - hosted_agent_version: When set together with a hosted-agent - source, pins the source to a specific hosted-agent version. - Recommended for reproducible rubric generation against - PromptAgents. - force_prompt_source: When ``True``, always emit a - ``type="prompt"`` source with the rendered dossier even when - the agent is a hosted Foundry agent. Useful when the local - wrapper holds overrides the service-side agent doesn't see. - - Returns: - An :class:`EvalGenerationSource` describing the agent. 
- """ - agent_description = getattr(agent, "description", None) - - resolved_name = hosted_agent_name - resolved_version = hosted_agent_version - if resolved_name is None and not force_prompt_source: - detected_name, detected_version = _detect_hosted_foundry_agent(agent) - if detected_name is not None: - resolved_name = detected_name - if resolved_version is None: - resolved_version = detected_version - - if resolved_name is not None and not force_prompt_source: - return EvalGenerationSource( - type="agent", - agent_name=resolved_name, - agent_version=resolved_version, - description=agent_description, - ) - - prompt = agent.as_eval_source( - include_instructions=include_instructions, - include_tools=include_tools, - include_context_providers=include_context_providers, - include_examples=include_examples, - examples=examples, - ) - return EvalGenerationSource( - type="prompt", - prompt=prompt, - description=agent_description, - ) - - -def _detect_hosted_foundry_agent(agent: BaseAgent) -> tuple[str | None, str | None]: - """Return ``(agent_name, agent_version)`` for hosted Foundry agents, else ``(None, None)``. - - A hosted Foundry agent is one whose ``chat_client`` exposes a string - ``agent_name`` — the convention used by ``RawFoundryAgentChatClient`` - when ``FoundryAgent`` connects to an existing Prompt Agent or Hosted - Agent in a Foundry project. Only string values are accepted so - test doubles using ``MagicMock`` for ``chat_client`` are not - mis-detected. 
- """ - chat_client = getattr(agent, "chat_client", None) - if chat_client is None: - return None, None - name = getattr(chat_client, "agent_name", None) - version = getattr(chat_client, "agent_version", None) - if not isinstance(name, str) or not name: - return None, None - if not isinstance(version, str) or not version: - version = None - return name, version - - -@experimental(feature_id=ExperimentalFeature.EVALS) -def workflow_as_eval_source( - workflow: Workflow, - *, - include_instructions: bool = True, - include_tools: bool = True, - include_context_providers: bool = False, - include_examples: bool = False, - examples: Sequence[str] | None = None, - include_topology: bool = True, -) -> EvalGenerationSource: - """Render a workflow as an :class:`EvalGenerationSource` for rubric generation. - - Wraps :meth:`Workflow.as_eval_source` to package the workflow's - rendered dossier (workflow name, description, topology, per-agent - dossiers) into a typed ``type="prompt"`` Foundry generation source. - - Args: - workflow: Workflow instance to render. - include_instructions: Per-agent instructions inclusion. - include_tools: Per-agent tools inclusion. - include_context_providers: Per-agent context-provider inclusion. - Defaults to ``False``. - include_examples: Per-agent examples inclusion. Defaults to - ``False``. - examples: Optional workflow-level sample queries. Rendered into - a top-level ``Examples:`` section when ``include_examples`` is - ``True``. - include_topology: Whether to embed the JSON-encoded workflow - topology produced by :meth:`Workflow.to_dict`. Defaults to - ``True``. - - Returns: - A ``type="prompt"`` :class:`EvalGenerationSource` describing the - workflow. 
- """ - prompt = workflow.as_eval_source( - include_instructions=include_instructions, - include_tools=include_tools, - include_context_providers=include_context_providers, - include_examples=include_examples, - examples=examples, - include_topology=include_topology, - ) - return EvalGenerationSource( - type="prompt", - prompt=prompt, - description=workflow.description, - ) + return cls(name=name, version=None, display_name=display_name) # endregion @@ -1237,561 +961,6 @@ async def _evaluate_via_dataset( provider=self.name, ) - @classmethod - @experimental(feature_id=ExperimentalFeature.EVALS) - async def generate_rubric( - cls, - *, - project_client: AIProjectClient, - name: str, - agent: BaseAgent | None = None, - workflow: Workflow | None = None, - sources: Sequence[EvalGenerationSource] | None = None, - category: Literal["quality", "safety"] = "quality", - model: str | None = None, - display_name: str | None = None, - description: str | None = None, - operation_id: str | None = None, - poll_interval: float = 5.0, - timeout: float = 600.0, - ) -> GeneratedEvaluatorRef: - """Generate a Foundry rubric evaluator from an agent or workflow. - - Drives the Foundry evaluator-generation long-running operation - (``client.beta.evaluators.create_generation_job``) end-to-end and - returns a pinned :class:`GeneratedEvaluatorRef` for use with - :class:`FoundryEvals` ``evaluators=`` lists. - - Exactly one of ``agent``, ``workflow``, or ``sources`` must be - supplied. When ``agent`` or ``workflow`` is given, - :func:`agent_as_eval_source` / :func:`workflow_as_eval_source` is - used to build a single conservative source (instructions and - tools included; examples and context providers excluded). Pass - ``sources=`` directly to control inclusion explicitly or to - provide multiple sources. - - Requires ``azure-ai-projects`` with the rubric-generation APIs - (currently ``2.3.0a*`` on the Azure SDK dev feed; tracked for an - upcoming PyPI release). 
Raises :class:`NotImplementedError` with - a clear message when the dependency is unavailable. - - Keyword Args: - project_client: Async ``AIProjectClient`` for the target - Foundry project. - name: Evaluator name to register in the project. Must be a - stable identifier (e.g. ``"policy-enforcement-v1"``). - agent: Optional ``BaseAgent`` to derive a source from. - workflow: Optional ``Workflow`` to derive a source from. - sources: Explicit list of :class:`EvalGenerationSource` - instances. Mutually exclusive with ``agent`` / ``workflow``. - category: ``"quality"`` or ``"safety"``. Defaults to - ``"quality"``. - model: Optional model deployment to drive generation. When - omitted the service picks a default. - display_name: Optional human-readable name for the evaluator. - description: Optional description for the evaluator. - operation_id: Optional caller-supplied operation id to make - the create call idempotent. - poll_interval: Seconds between job-status polls. - timeout: Maximum seconds to wait for the job to complete. - - Returns: - A pinned :class:`GeneratedEvaluatorRef` referring to the - newly created evaluator. - - Raises: - ValueError: If the source arguments are inconsistent. - NotImplementedError: If the installed ``azure-ai-projects`` - version does not expose the rubric APIs. - TimeoutError: If the job does not complete within ``timeout``. - RuntimeError: If the generation job ends in a non-succeeded - terminal state. 
- """ - resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources) - - if category not in ("quality", "safety"): - raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.") - - try: - sdk_types = _import_generation_sdk_types() - except _RubricSdkUnavailableError as exc: - raise NotImplementedError(str(exc)) from exc - - sdk_sources = [_to_sdk_source(s, sdk_types) for s in resolved_sources] - - inputs_kwargs: dict[str, Any] = { - "name": name, - "category": category, - "sources": sdk_sources, - } - if model is not None: - inputs_kwargs["model"] = model - if display_name is not None: - inputs_kwargs["display_name"] = display_name - if description is not None: - inputs_kwargs["description"] = description - - inputs = sdk_types.EvaluatorGenerationInputs(**inputs_kwargs) - job = sdk_types.EvaluatorGenerationJob(inputs=inputs) - - create_kwargs: dict[str, Any] = {"job": job} - if operation_id is not None: - create_kwargs["operation_id"] = operation_id - - evaluators_ops = _get_beta_evaluators(project_client) - created = await evaluators_ops.create_generation_job(**create_kwargs) - completed = await _poll_generation_job( - evaluators_ops, - created, - poll_interval=poll_interval, - timeout=timeout, - ) - - return _generation_job_to_ref(completed, category=category) - - @classmethod - @experimental(feature_id=ExperimentalFeature.EVALS) - async def create_rubric_evaluator( - cls, - *, - project_client: AIProjectClient, - name: str, - dimensions: Sequence[RubricDimension], - category: Literal["quality", "safety"] = "quality", - pass_threshold: float | None = None, - display_name: str | None = None, - description: str | None = None, - tags: dict[str, str] | None = None, - metadata: dict[str, str] | None = None, - ) -> GeneratedEvaluatorRef: - """Register a rubric evaluator from caller-supplied dimensions. 
- - This is the *manual* counterpart to :meth:`generate_rubric` and - maps directly to ``project_client.beta.evaluators.create_version``. - Use it to bring a rubric you authored elsewhere (e.g. authored - from an agent's local context, ported from another framework, or - hand-tuned) into Foundry as a versioned ``EvaluatorVersion`` - that any subsequent ``evaluators=`` list can reference via the - returned :class:`GeneratedEvaluatorRef`. - - The service auto-attaches a non-editable residual dimension - (``general_quality`` for ``category="quality"``, - ``general_policy_compliance`` for ``"safety"``) — do not include - it in ``dimensions``. - - Keyword Args: - project_client: Async ``AIProjectClient`` for the target - Foundry project. - name: Stable evaluator name (e.g. - ``"reservation-agent-policy-v1"``). A new version is - allocated on each call. - dimensions: One or more :class:`RubricDimension` instances - describing the scoring blueprint. Each dimension's - ``id`` must be unique; ``weight`` must be in ``[1, 10]``. - category: ``"quality"`` (default) or ``"safety"``. - pass_threshold: Optional aggregate pass threshold on the - normalized 0.0-1.0 scale. Defaults to the service-side - default of ``0.5`` when omitted. - display_name: Optional human-readable name shown in the - Foundry portal. - description: Optional asset description. - tags: Optional asset tags. - metadata: Optional free-form metadata persisted with the - evaluator definition. - - Returns: - A pinned :class:`GeneratedEvaluatorRef` referring to the - newly created evaluator version. - - Raises: - ValueError: If ``dimensions`` is empty, contains duplicate - ids, or contains a weight outside ``[1, 10]``. - NotImplementedError: If the installed ``azure-ai-projects`` - version does not expose the manual rubric APIs. 
- """ - if category not in ("quality", "safety"): - raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.") - if pass_threshold is not None and not (0.0 <= pass_threshold <= 1.0): - raise ValueError(f"pass_threshold must be in [0.0, 1.0] when set (got {pass_threshold!r}).") - if not dimensions: - raise ValueError("create_rubric_evaluator requires at least one dimension.") - - try: - sdk_types = _import_manual_rubric_sdk_types() - except _RubricSdkUnavailableError as exc: - raise NotImplementedError(str(exc)) from exc - - sdk_dimensions = _to_sdk_dimensions(dimensions, sdk_types.Dimension) - definition_kwargs: dict[str, Any] = {"dimensions": sdk_dimensions} - if pass_threshold is not None: - definition_kwargs["pass_threshold"] = pass_threshold - definition = sdk_types.RubricBasedEvaluatorDefinition(**definition_kwargs) - - version_kwargs: dict[str, Any] = { - "evaluator_type": "custom", - "categories": [category], - "definition": definition, - } - if display_name is not None: - version_kwargs["display_name"] = display_name - if description is not None: - version_kwargs["description"] = description - if tags is not None: - version_kwargs["tags"] = tags - if metadata is not None: - version_kwargs["metadata"] = metadata - - evaluator_version = sdk_types.EvaluatorVersion(**version_kwargs) - evaluators_ops = _get_beta_evaluators(project_client) - created = await evaluators_ops.create_version(name, evaluator_version=evaluator_version) - - return _evaluator_version_to_ref(created, fallback_name=name, category=category) - - -_TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"}) - - -class _RubricSdkUnavailableError(Exception): - """Raised when azure-ai-projects lacks the rubric-generation APIs.""" - - -@dataclass(frozen=True) -class _GenerationSdkTypes: - """Resolved SDK type handles for rubric-evaluator generation.""" - - EvaluatorGenerationInputs: Any - EvaluatorGenerationJob: Any - PromptSource: 
Any - AgentSource: Any | None - DatasetSource: Any | None - TracesSource: Any | None - - -@dataclass(frozen=True) -class _ManualRubricSdkTypes: - """Resolved SDK type handles for manual rubric-evaluator creation.""" - - EvaluatorVersion: Any - RubricBasedEvaluatorDefinition: Any - Dimension: Any - - -_RUBRIC_SDK_MISSING_MSG = ( - "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs " - "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). " - "Install a build that exposes " - "`azure.ai.projects.models.EvaluatorGenerationInputs` and " - "`AIProjectClient.beta.evaluators.create_generation_job`." -) - - -_MANUAL_RUBRIC_SDK_MISSING_MSG = ( - "FoundryEvals.create_rubric_evaluator requires the manual rubric-evaluator " - "APIs from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev " - "feed). Install a build that exposes " - "`azure.ai.projects.models.RubricBasedEvaluatorDefinition`, " - "`azure.ai.projects.models.Dimension`, and " - "`AIProjectClient.beta.evaluators.create_version`." 
-) - - -def _import_generation_sdk_types() -> _GenerationSdkTypes: - """Lazily resolve the rubric-generation SDK types from azure-ai-projects.""" - try: - from azure.ai.projects import models as _models # type: ignore[import-not-found] - except ImportError as exc: - raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) from exc - - models_mod: Any = _models - inputs_cls: Any = getattr(models_mod, "EvaluatorGenerationInputs", None) - job_cls: Any = getattr(models_mod, "EvaluatorGenerationJob", None) - prompt_cls: Any = getattr(models_mod, "PromptEvaluatorGenerationJobSource", None) - if inputs_cls is None or job_cls is None or prompt_cls is None: - raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) - - agent_cls: Any = getattr(models_mod, "AgentEvaluatorGenerationJobSource", None) - dataset_cls: Any = getattr(models_mod, "DatasetEvaluatorGenerationJobSource", None) - traces_cls: Any = getattr(models_mod, "TracesEvaluatorGenerationJobSource", None) - - return _GenerationSdkTypes( - EvaluatorGenerationInputs=inputs_cls, - EvaluatorGenerationJob=job_cls, - PromptSource=prompt_cls, - AgentSource=agent_cls, - DatasetSource=dataset_cls, - TracesSource=traces_cls, - ) - - -def _import_manual_rubric_sdk_types() -> _ManualRubricSdkTypes: - """Lazily resolve the manual rubric-evaluator SDK types from azure-ai-projects.""" - try: - from azure.ai.projects import models as _models # type: ignore[import-not-found] - except ImportError as exc: - raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG) from exc - - models_mod: Any = _models - version_cls: Any = getattr(models_mod, "EvaluatorVersion", None) - definition_cls: Any = getattr(models_mod, "RubricBasedEvaluatorDefinition", None) - dimension_cls: Any = getattr(models_mod, "Dimension", None) - if version_cls is None or definition_cls is None or dimension_cls is None: - raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG) - - return _ManualRubricSdkTypes( - EvaluatorVersion=version_cls, - 
RubricBasedEvaluatorDefinition=definition_cls, - Dimension=dimension_cls, - ) - - -def _to_sdk_dimensions( - dimensions: Sequence[RubricDimension], - dimension_cls: Any, -) -> list[Any]: - """Translate user-facing ``RubricDimension`` instances to SDK ``Dimension`` models. - - The agent-framework type uses ``id`` (matching the runtime output - schema and competing frameworks); the SDK input model uses - ``dimension_id`` for the editable identifier. - """ - if not dimensions: - raise ValueError("create_rubric_evaluator requires at least one dimension.") - seen: set[str] = set() - sdk_dims: list[Any] = [] - for dim in dimensions: - if not dim.id: - raise ValueError("RubricDimension.id must be a non-empty string.") - if not dim.description: - raise ValueError(f"RubricDimension(id={dim.id!r}).description must be non-empty.") - if not isinstance(dim.weight, int) or not (1 <= dim.weight <= 10): - raise ValueError(f"RubricDimension(id={dim.id!r}).weight must be an int in [1, 10] (got {dim.weight!r}).") - if dim.id in seen: - raise ValueError(f"Duplicate RubricDimension.id={dim.id!r}; ids must be unique within a rubric.") - seen.add(dim.id) - kwargs: dict[str, Any] = { - "dimension_id": dim.id, - "description": dim.description, - "weight": dim.weight, - } - if dim.always_applicable: - kwargs["always_applicable"] = True - sdk_dims.append(dimension_cls(**kwargs)) - return sdk_dims - - -def _evaluator_version_to_ref( - created: Any, - *, - fallback_name: str, - category: Literal["quality", "safety"], -) -> GeneratedEvaluatorRef: - """Translate a persisted ``EvaluatorVersion`` to a :class:`GeneratedEvaluatorRef`. - - Used by both the generation-job path and the manual ``create_version`` - path so callers see a uniform pinned reference regardless of how the - evaluator was authored. 
- """ - ev_name = getattr(created, "name", None) or fallback_name - ev_version = getattr(created, "version", None) - if ev_version is None: - raise RuntimeError("Created evaluator version is missing a version identifier.") - - definition: Any = getattr(created, "definition", None) - dimensions: tuple[RubricDimension, ...] | None = None - raw_dims: Any = getattr(definition, "dimensions", None) if definition is not None else None - if raw_dims: - parsed: list[RubricDimension] = [] - for entry in raw_dims: - dim_id = getattr(entry, "dimension_id", None) or getattr(entry, "id", None) - try: - parsed.append( - RubricDimension( - id=str(dim_id or ""), - description=str(getattr(entry, "description", "") or ""), - weight=int(getattr(entry, "weight", 0) or 0), - always_applicable=bool(getattr(entry, "always_applicable", False)), - ) - ) - except (TypeError, ValueError): - logger.debug("Skipping malformed dimension on persisted evaluator", exc_info=True) - if parsed: - dimensions = tuple(parsed) - - pass_threshold: float | None = None - raw_threshold: Any = getattr(definition, "pass_threshold", None) if definition is not None else None - if isinstance(raw_threshold, (int, float)): - pass_threshold = float(raw_threshold) - - return GeneratedEvaluatorRef( - name=str(ev_name), - version=str(ev_version), - category=category, - display_name=getattr(created, "display_name", None), - description=getattr(created, "description", None), - dimensions=dimensions, - pass_threshold=pass_threshold, - ) - - -def _get_beta_evaluators(project_client: AIProjectClient) -> Any: - """Return the ``project_client.beta.evaluators`` operations group, or raise.""" - beta = getattr(project_client, "beta", None) - evaluators_ops = getattr(beta, "evaluators", None) if beta is not None else None - if evaluators_ops is None: - raise NotImplementedError(_RUBRIC_SDK_MISSING_MSG) - return evaluators_ops - - -def _coalesce_generation_sources( - *, - agent: BaseAgent | None, - workflow: Workflow | None, - 
sources: Sequence[EvalGenerationSource] | None, -) -> list[EvalGenerationSource]: - if sources is not None and not sources: - raise ValueError("sources= must contain at least one EvalGenerationSource.") - supplied = [bool(agent), bool(workflow), bool(sources)] - if sum(supplied) == 0: - raise ValueError("Provide one of agent=, workflow=, or sources=.") - if sum(supplied) > 1: - raise ValueError("Provide only one of agent=, workflow=, or sources=.") - if sources is not None: - return list(sources) - if agent is not None: - return [agent_as_eval_source(agent)] - if workflow is None: - raise ValueError("workflow= must be provided when agent= and sources= are not set.") - return [workflow_as_eval_source(workflow)] - - -def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) -> Any: - """Translate an :class:`EvalGenerationSource` to its SDK counterpart.""" - if source.type == "prompt": - if not source.prompt: - raise ValueError("EvalGenerationSource(type='prompt') requires a non-empty prompt.") - kwargs: dict[str, Any] = {"prompt": source.prompt} - if source.description is not None: - kwargs["description"] = source.description - return sdk_types.PromptSource(**kwargs) - if source.type == "agent": - if sdk_types.AgentSource is None: - raise NotImplementedError("Installed azure-ai-projects does not expose AgentEvaluatorGenerationJobSource.") - if not source.agent_name: - raise ValueError("EvalGenerationSource(type='agent') requires agent_name.") - kwargs = {"agent_name": source.agent_name} - if source.agent_version is not None: - kwargs["agent_version"] = source.agent_version - if source.description is not None: - kwargs["description"] = source.description - return sdk_types.AgentSource(**kwargs) - if source.type == "dataset": - if sdk_types.DatasetSource is None: - raise NotImplementedError( - "Installed azure-ai-projects does not expose DatasetEvaluatorGenerationJobSource." 
- ) - if not source.dataset_name: - raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.") - # SDK uses ``name`` / ``version`` (not ``dataset_name`` / ``dataset_version``). - kwargs = {"name": source.dataset_name} - if source.dataset_version is not None: - kwargs["version"] = source.dataset_version - if source.description is not None: - kwargs["description"] = source.description - return sdk_types.DatasetSource(**kwargs) - if source.type == "traces": - if sdk_types.TracesSource is None: - raise NotImplementedError("Installed azure-ai-projects does not expose TracesEvaluatorGenerationJobSource.") - kwargs = {} - if source.metadata is not None: - kwargs["metadata"] = source.metadata - if source.description is not None: - kwargs["description"] = source.description - return sdk_types.TracesSource(**kwargs) - raise ValueError(f"Unknown EvalGenerationSource type: {source.type!r}") - - -async def _poll_generation_job( - evaluators_ops: Any, - job: Any, - *, - poll_interval: float, - timeout: float, -) -> Any: - """Poll a rubric-generation job until it reaches a terminal state.""" - job_id = getattr(job, "id", None) - if not job_id: - raise RuntimeError("Rubric generation job did not return an id.") - - loop = asyncio.get_running_loop() - deadline = loop.time() + timeout - current = job - while True: - status = (getattr(current, "status", "") or "").lower() - if status in _TERMINAL_GENERATION_STATUSES: - if status != "succeeded": - err = getattr(current, "error", None) - err_msg = getattr(err, "message", None) or str(err) if err is not None else status - raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {err_msg}") - return current - remaining = deadline - loop.time() - if remaining <= 0: - raise TimeoutError( - f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})." 
- ) - await asyncio.sleep(min(poll_interval, remaining)) - current = await evaluators_ops.get_generation_job(job_id) - - -def _generation_job_to_ref(job: Any, *, category: Literal["quality", "safety"]) -> GeneratedEvaluatorRef: - """Build a pinned :class:`GeneratedEvaluatorRef` from a completed job.""" - artifacts: Any = getattr(job, "artifacts", None) - evaluator: Any = getattr(artifacts, "evaluator", None) if artifacts is not None else None - if evaluator is None: - raise RuntimeError("Rubric generation job completed without an evaluator artifact.") - - ev_name = getattr(evaluator, "name", None) - ev_version = getattr(evaluator, "version", None) - if not ev_name: - raise RuntimeError("Generated evaluator artifact is missing a name.") - if ev_version is None: - raise RuntimeError("Generated evaluator artifact is missing a version.") - - definition: Any = getattr(evaluator, "definition", None) - dimensions_raw: Any = getattr(definition, "dimensions", None) if definition is not None else None - dimensions: tuple[RubricDimension, ...] 
| None = None - if dimensions_raw: - parsed: list[RubricDimension] = [] - for entry in dimensions_raw: - try: - parsed.append( - RubricDimension( - id=str(getattr(entry, "id", "") or ""), - description=str(getattr(entry, "description", "") or ""), - weight=int(getattr(entry, "weight", 0) or 0), - always_applicable=bool(getattr(entry, "always_applicable", False)), - ) - ) - except (TypeError, ValueError): - logger.debug("Skipping malformed dimension on generated evaluator", exc_info=True) - if parsed: - dimensions = tuple(parsed) - - pass_threshold: float | None = None - if definition is not None: - raw_threshold = getattr(definition, "pass_threshold", None) - if isinstance(raw_threshold, (int, float)): - pass_threshold = float(raw_threshold) - - return GeneratedEvaluatorRef( - name=str(ev_name), - version=str(ev_version), - category=category, - display_name=getattr(evaluator, "display_name", None), - description=getattr(evaluator, "description", None), - dimensions=dimensions, - pass_threshold=pass_threshold, - ) - # --------------------------------------------------------------------------- # Foundry-specific functions (not part of the Evaluator protocol) diff --git a/python/packages/foundry/tests/test_evals_config.py b/python/packages/foundry/tests/test_evals_config.py deleted file mode 100644 index a1c86187d47..00000000000 --- a/python/packages/foundry/tests/test_evals_config.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -"""Tests for the YAML-driven evaluator configuration loader.""" - -from __future__ import annotations - -import textwrap -from pathlib import Path -from typing import Any -from unittest.mock import MagicMock - -import pytest - -from agent_framework_foundry._evals_config import ( - RubricGenerationSpec, - RubricSourceSpec, - build_sources, - load_evals_config, - parse_evals_config, -) -from agent_framework_foundry._foundry_evals import EvalGenerationSource - - -def _make_agent(name: str = "agent-a", instructions: str = "Be brief.") -> Any: - from agent_framework._evaluation import _render_agent_dossier - - agent = MagicMock() - agent.name = name - agent.description = f"{name} description" - agent.default_options = {"instructions": instructions, "tools": []} - agent.context_providers = [] - agent.mcp_tools = [] - agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier( - agent, - include_instructions=kw.get("include_instructions", True), - include_tools=kw.get("include_tools", True), - include_context_providers=kw.get("include_context_providers", False), - include_examples=kw.get("include_examples", False), - examples=kw.get("examples"), - ) - return agent - - -def _make_workflow() -> Any: - from agent_framework._evaluation import _render_workflow_dossier - - workflow = MagicMock() - workflow.name = "wf-1" - workflow.description = "demo" - workflow.to_dict.return_value = {"name": "wf-1", "id": "wf_1", "executors": {}, "edge_groups": []} - workflow.executors = {} - workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier( - workflow, - include_instructions=kw.get("include_instructions", True), - include_tools=kw.get("include_tools", True), - include_context_providers=kw.get("include_context_providers", False), - include_examples=kw.get("include_examples", False), - examples=kw.get("examples"), - include_topology=kw.get("include_topology", True), - ) - return workflow - - -class TestParseEvalsConfig: - """Parsing already-loaded dicts 
into RubricGenerationSpec instances.""" - - def test_minimal_spec(self) -> None: - config = parse_evals_config({ - "evaluators": { - "my-rubric": { - "type": "foundry.generated_rubric", - } - } - }) - assert "my-rubric" in config - spec = config["my-rubric"] - assert spec.name == "my-rubric" - assert spec.type == "foundry.generated_rubric" - assert spec.category == "quality" - assert spec.sources == () - - def test_full_spec_with_sources(self) -> None: - config = parse_evals_config({ - "evaluators": { - "reservation-quality": { - "type": "foundry.generated_rubric", - "category": "quality", - "model": "gpt-4o", - "agent": "reservation-agent", - "display_name": "Reservation Quality", - "description": "Custom rubric for reservation agent.", - "sources": [ - { - "type": "agent", - "include_instructions": True, - "include_tools": True, - "include_context_providers": True, - }, - { - "type": "dataset", - "name": "reservation-business-rules", - "version": 1, - }, - ], - } - } - }) - spec = config["reservation-quality"] - assert spec.model == "gpt-4o" - assert spec.agent == "reservation-agent" - assert spec.display_name == "Reservation Quality" - assert len(spec.sources) == 2 - - agent_src = spec.sources[0] - assert agent_src.type == "agent" - assert agent_src.include_context_providers is True - - dataset_src = spec.sources[1] - assert dataset_src.type == "dataset" - assert dataset_src.name == "reservation-business-rules" - assert dataset_src.version == "1" # coerced to string - - def test_rejects_non_mapping(self) -> None: - with pytest.raises(ValueError, match="must be a mapping"): - parse_evals_config([]) - - def test_rejects_missing_evaluators_key(self) -> None: - with pytest.raises(ValueError, match="evaluators"): - parse_evals_config({"other": {}}) - - def test_rejects_unknown_type(self) -> None: - with pytest.raises(ValueError, match="unsupported type"): - parse_evals_config({"evaluators": {"x": {"type": "foundry.other"}}}) - - def 
test_rejects_invalid_category(self) -> None: - with pytest.raises(ValueError, match="invalid category"): - parse_evals_config({"evaluators": {"x": {"type": "foundry.generated_rubric", "category": "bogus"}}}) - - def test_rejects_invalid_source_type(self) -> None: - with pytest.raises(ValueError, match="invalid type"): - parse_evals_config({ - "evaluators": { - "x": { - "type": "foundry.generated_rubric", - "sources": [{"type": "bogus"}], - } - } - }) - - -class TestLoadEvalsConfig: - """End-to-end YAML loading.""" - - def test_load_from_yaml_file(self, tmp_path: Path) -> None: - pytest.importorskip("yaml") - config_path = tmp_path / "evals.yaml" - config_path.write_text( - textwrap.dedent( - """\ - evaluators: - my-eval: - type: foundry.generated_rubric - category: safety - model: gpt-4o-mini - sources: - - type: prompt - prompt: "Score the response." - """ - ), - encoding="utf-8", - ) - config = load_evals_config(config_path) - assert "my-eval" in config - spec = config["my-eval"] - assert spec.category == "safety" - assert spec.model == "gpt-4o-mini" - assert len(spec.sources) == 1 - assert spec.sources[0].type == "prompt" - assert spec.sources[0].prompt == "Score the response." 
- - -class TestBuildSources: - """Translate RubricGenerationSpec sources into EvalGenerationSource instances.""" - - def test_no_sources_with_agent_default(self) -> None: - spec = RubricGenerationSpec(name="x") - agent = _make_agent() - sources = build_sources(spec, agent=agent) - assert len(sources) == 1 - assert sources[0].type == "prompt" - assert sources[0].prompt is not None - assert "Agent name: agent-a" in sources[0].prompt - - def test_no_sources_with_workflow_default(self) -> None: - spec = RubricGenerationSpec(name="x") - workflow = _make_workflow() - sources = build_sources(spec, workflow=workflow) - assert len(sources) == 1 - assert sources[0].type == "prompt" - assert sources[0].prompt is not None - assert "Workflow name: wf-1" in sources[0].prompt - - def test_no_sources_no_agent_or_workflow_raises(self) -> None: - spec = RubricGenerationSpec(name="x") - with pytest.raises(ValueError, match="no sources"): - build_sources(spec) - - def test_agent_source_uses_supplied_agent(self) -> None: - spec = RubricGenerationSpec( - name="x", - sources=(RubricSourceSpec(type="agent", include_context_providers=True),), - ) - agent = _make_agent() - sources = build_sources(spec, agent=agent) - assert sources[0].type == "prompt" - assert sources[0].prompt is not None - assert "Agent name: agent-a" in sources[0].prompt - - def test_agent_source_with_agent_name_uses_hosted_path(self) -> None: - spec = RubricGenerationSpec( - name="x", - sources=(RubricSourceSpec(type="agent", agent_name="hosted-foundry-agent"),), - ) - sources = build_sources(spec) - assert sources[0].type == "agent" - assert sources[0].agent_name == "hosted-foundry-agent" - - def test_agent_source_without_agent_raises(self) -> None: - spec = RubricGenerationSpec( - name="x", - sources=(RubricSourceSpec(type="agent"),), - ) - with pytest.raises(ValueError, match="no agent="): - build_sources(spec) - - def test_workflow_source_uses_supplied_workflow(self) -> None: - spec = RubricGenerationSpec( - 
name="x", - sources=(RubricSourceSpec(type="workflow", include_topology=False),), - ) - workflow = _make_workflow() - sources = build_sources(spec, workflow=workflow) - assert sources[0].type == "prompt" - assert sources[0].prompt is not None - assert "Workflow name: wf-1" in sources[0].prompt - assert "Topology (JSON):" not in sources[0].prompt - - def test_prompt_source_translates_directly(self) -> None: - spec = RubricGenerationSpec( - name="x", - sources=(RubricSourceSpec(type="prompt", prompt="Score it."),), - ) - sources = build_sources(spec) - assert sources[0] == EvalGenerationSource(type="prompt", prompt="Score it.") - - def test_dataset_source_translates(self) -> None: - spec = RubricGenerationSpec( - name="x", - sources=(RubricSourceSpec(type="dataset", name="ds", version="2"),), - ) - sources = build_sources(spec) - assert sources[0].type == "dataset" - assert sources[0].dataset_name == "ds" - assert sources[0].dataset_version == "2" - - def test_traces_source_passes_metadata(self) -> None: - spec = RubricGenerationSpec( - name="x", - sources=(RubricSourceSpec(type="traces", metadata={"environment": "prod"}),), - ) - sources = build_sources(spec) - assert sources[0].type == "traces" - assert sources[0].metadata == {"environment": "prod"} diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index d24c528a744..df8627352bb 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -6,7 +6,7 @@ import json from dataclasses import dataclass -from typing import Any, cast +from typing import Any from unittest.mock import AsyncMock, MagicMock import pytest @@ -27,7 +27,6 @@ from agent_framework_foundry._foundry_evals import ( FoundryEvals, - RubricDimension, _build_item_schema, _build_testing_criteria, _extract_per_evaluator, @@ -65,32 +64,6 @@ def _make_tool(name: str) -> MagicMock: return t -def _make_stub_agent( - *, - name: 
str = "alpha", - description: str = "An agent.", - instructions: str = "Be brief.", -) -> MagicMock: - """Mock agent whose as_eval_source returns a real dossier string.""" - from agent_framework._evaluation import _render_agent_dossier - - agent = MagicMock() - agent.name = name - agent.description = description - agent.default_options = {"instructions": instructions, "tools": []} - agent.context_providers = [] - agent.mcp_tools = [] - agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier( - agent, - include_instructions=kw.get("include_instructions", True), - include_tools=kw.get("include_tools", True), - include_context_providers=kw.get("include_context_providers", False), - include_examples=kw.get("include_examples", False), - examples=kw.get("examples"), - ) - return agent - - @dataclass class _MockResultCounts: """Mock matching the OpenAI SDK ResultCounts Pydantic model shape.""" @@ -3045,729 +3018,3 @@ async def test_target_without_type_raises(self) -> None: client=mock_client, model="gpt-4o", ) - - -class TestFoundryAgentAsEvalSource: - """Tests for foundry's agent_as_eval_source helper (wraps BaseAgent.as_eval_source).""" - - def test_returns_prompt_source_with_dossier(self) -> None: - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") - source = agent_as_eval_source(agent) - assert source.type == "prompt" - assert source.description == "Looks up the weather." - assert source.prompt is not None - assert "Agent name: weather-bot" in source.prompt - assert "Be brief." 
in source.prompt - - def test_hosted_agent_name_emits_agent_source(self) -> None: - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") - source = agent_as_eval_source(agent, hosted_agent_name="weather-bot-hosted-id") - assert source.type == "agent" - assert source.agent_name == "weather-bot-hosted-id" - assert source.prompt is None - assert source.description == "Looks up the weather." - - def test_explicit_hosted_agent_version_forwarded(self) -> None: - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="weather-bot") - source = agent_as_eval_source( - agent, - hosted_agent_name="weather-bot-hosted-id", - hosted_agent_version="3", - ) - assert source.type == "agent" - assert source.agent_name == "weather-bot-hosted-id" - assert source.agent_version == "3" - - def test_auto_detects_hosted_foundry_agent(self) -> None: - """A chat_client carrying agent_name/agent_version is treated as a hosted agent.""" - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") - agent.chat_client = MagicMock() - agent.chat_client.agent_name = "weather-prompt-agent" - agent.chat_client.agent_version = "2" - - source = agent_as_eval_source(agent) - assert source.type == "agent" - assert source.agent_name == "weather-prompt-agent" - assert source.agent_version == "2" - assert source.prompt is None - assert source.description == "Looks up the weather." 
- - def test_auto_detection_handles_versionless_hosted_agent(self) -> None: - """HostedAgents typically omit agent_version (no None forwarded).""" - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="weather-bot") - agent.chat_client = MagicMock() - agent.chat_client.agent_name = "weather-hosted-agent" - agent.chat_client.agent_version = None - - source = agent_as_eval_source(agent) - assert source.type == "agent" - assert source.agent_name == "weather-hosted-agent" - assert source.agent_version is None - - def test_force_prompt_source_overrides_auto_detection(self) -> None: - """force_prompt_source=True falls back to dossier even for hosted agents.""" - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.") - agent.chat_client = MagicMock() - agent.chat_client.agent_name = "weather-prompt-agent" - agent.chat_client.agent_version = "2" - - source = agent_as_eval_source(agent, force_prompt_source=True) - assert source.type == "prompt" - assert source.prompt is not None - assert "Agent name: weather-bot" in source.prompt - - def test_auto_detection_ignores_non_string_chat_client_fields(self) -> None: - """Bare MagicMock chat_client (untyped attrs) must not trigger detection.""" - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent(name="local-agent") - agent.chat_client = MagicMock() # agent_name attr resolves to a MagicMock, not a str - - source = agent_as_eval_source(agent) - assert source.type == "prompt" - assert source.prompt is not None - assert "Agent name: local-agent" in source.prompt - - def test_forwards_keyword_options_to_agent(self) -> None: - from agent_framework_foundry._foundry_evals import agent_as_eval_source - - agent = _make_stub_agent() - source = agent_as_eval_source(agent, include_instructions=False) - assert source.prompt is not None - 
assert "Instructions:" not in source.prompt - - -class TestFoundryWorkflowAsEvalSource: - """Tests for foundry's workflow_as_eval_source helper (wraps Workflow.as_eval_source).""" - - def _make_workflow(self) -> MagicMock: - from agent_framework._evaluation import _render_workflow_dossier - - workflow = MagicMock() - workflow.name = "demo-workflow" - workflow.description = "Routes user questions." - workflow.to_dict.return_value = { - "name": "demo-workflow", - "id": "wf_1", - "executors": {}, - "edge_groups": [], - } - workflow.executors = {} - workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier( - workflow, - include_instructions=kw.get("include_instructions", True), - include_tools=kw.get("include_tools", True), - include_context_providers=kw.get("include_context_providers", False), - include_examples=kw.get("include_examples", False), - examples=kw.get("examples"), - include_topology=kw.get("include_topology", True), - ) - return workflow - - def test_returns_prompt_source_with_topology(self) -> None: - from agent_framework_foundry._foundry_evals import workflow_as_eval_source - - workflow = self._make_workflow() - source = workflow_as_eval_source(workflow) - assert source.type == "prompt" - assert source.description == "Routes user questions." 
- assert source.prompt is not None - assert "Workflow name: demo-workflow" in source.prompt - assert "Topology (JSON):" in source.prompt - - def test_topology_can_be_disabled(self) -> None: - from agent_framework_foundry._foundry_evals import workflow_as_eval_source - - workflow = self._make_workflow() - source = workflow_as_eval_source(workflow, include_topology=False) - assert source.prompt is not None - assert "Topology (JSON):" not in source.prompt - - -class TestCoalesceGenerationSources: - """Validation for the source-resolution helper used by FoundryEvals.generate_rubric.""" - - def test_requires_exactly_one_source(self) -> None: - from agent_framework_foundry._foundry_evals import _coalesce_generation_sources - - with pytest.raises(ValueError, match="Provide one of"): - _coalesce_generation_sources(agent=None, workflow=None, sources=None) - - def test_rejects_multiple_sources(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _coalesce_generation_sources - - agent = MagicMock() - agent.name = "a" - agent.description = None - agent.default_options = {"instructions": "x", "tools": []} - agent.context_providers = [] - agent.mcp_tools = [] - with pytest.raises(ValueError, match="only one of"): - _coalesce_generation_sources( - agent=agent, - workflow=None, - sources=[EvalGenerationSource(type="prompt", prompt="hi")], - ) - - def test_uses_agent_helper_when_only_agent_supplied(self) -> None: - from agent_framework_foundry._foundry_evals import _coalesce_generation_sources - - agent = _make_stub_agent(name="alpha", description="An agent.") - - sources = _coalesce_generation_sources(agent=agent, workflow=None, sources=None) - assert len(sources) == 1 - assert sources[0].type == "prompt" - assert sources[0].prompt is not None - assert "Agent name: alpha" in sources[0].prompt - - def test_rejects_empty_sources_list(self) -> None: - from agent_framework_foundry._foundry_evals import _coalesce_generation_sources - - with 
pytest.raises(ValueError, match="at least one"): - _coalesce_generation_sources(agent=None, workflow=None, sources=[]) - - -class TestToSdkSource: - """Translation between EvalGenerationSource and SDK *JobSource types.""" - - def _make_sdk_types(self, *, with_agent: bool = True, with_dataset: bool = True, with_traces: bool = True) -> Any: - from agent_framework_foundry._foundry_evals import _GenerationSdkTypes - - return _GenerationSdkTypes( - EvaluatorGenerationInputs=MagicMock(), - EvaluatorGenerationJob=MagicMock(), - PromptSource=MagicMock(name="PromptSource"), - AgentSource=MagicMock(name="AgentSource") if with_agent else None, - DatasetSource=MagicMock(name="DatasetSource") if with_dataset else None, - TracesSource=MagicMock(name="TracesSource") if with_traces else None, - ) - - def test_prompt_source_is_translated(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types() - sdk.PromptSource.return_value = "prompt-sdk-instance" - out = _to_sdk_source( - EvalGenerationSource(type="prompt", prompt="hello", description="d"), - sdk, - ) - assert out == "prompt-sdk-instance" - sdk.PromptSource.assert_called_once_with(prompt="hello", description="d") - - def test_prompt_without_text_raises(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types() - with pytest.raises(ValueError, match="non-empty prompt"): - _to_sdk_source(EvalGenerationSource(type="prompt"), sdk) - - def test_agent_source_is_translated(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types() - sdk.AgentSource.return_value = "agent-sdk-instance" - out = _to_sdk_source( - EvalGenerationSource(type="agent", agent_name="my-hosted-agent"), - sdk, - ) - assert out == "agent-sdk-instance" - sdk.AgentSource.assert_called_once_with(agent_name="my-hosted-agent") - - def 
test_agent_source_requires_name(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types() - with pytest.raises(ValueError, match="agent_name"): - _to_sdk_source(EvalGenerationSource(type="agent"), sdk) - - def test_agent_source_raises_when_sdk_missing(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types(with_agent=False) - with pytest.raises(NotImplementedError, match="AgentEvaluatorGenerationJobSource"): - _to_sdk_source( - EvalGenerationSource(type="agent", agent_name="x"), - sdk, - ) - - def test_dataset_source_is_translated(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types() - sdk.DatasetSource.return_value = "dataset-sdk-instance" - out = _to_sdk_source( - EvalGenerationSource(type="dataset", dataset_name="ds", dataset_version="1"), - sdk, - ) - assert out == "dataset-sdk-instance" - sdk.DatasetSource.assert_called_once_with(name="ds", version="1") - - def test_agent_source_forwards_agent_version(self) -> None: - from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source - - sdk = self._make_sdk_types() - sdk.AgentSource.return_value = "agent-sdk-instance" - out = _to_sdk_source( - EvalGenerationSource(type="agent", agent_name="prompt-agent", agent_version="2"), - sdk, - ) - assert out == "agent-sdk-instance" - sdk.AgentSource.assert_called_once_with(agent_name="prompt-agent", agent_version="2") - - -class TestPollGenerationJob: - """Behavior of the rubric-generation polling loop.""" - - async def test_returns_immediately_on_succeeded(self) -> None: - from agent_framework_foundry._foundry_evals import _poll_generation_job - - evaluators_ops = MagicMock() - evaluators_ops.get_generation_job = AsyncMock() - job = MagicMock(id="job_1", status="succeeded") - out = await 
_poll_generation_job(evaluators_ops, job, poll_interval=0.01, timeout=1.0) - assert out is job - evaluators_ops.get_generation_job.assert_not_called() - - async def test_polls_until_terminal(self) -> None: - from agent_framework_foundry._foundry_evals import _poll_generation_job - - running = MagicMock(id="job_1", status="running") - succeeded = MagicMock(id="job_1", status="succeeded") - evaluators_ops = MagicMock() - evaluators_ops.get_generation_job = AsyncMock(side_effect=[running, succeeded]) - - initial = MagicMock(id="job_1", status="running") - out = await _poll_generation_job(evaluators_ops, initial, poll_interval=0.001, timeout=1.0) - assert out is succeeded - assert evaluators_ops.get_generation_job.await_count == 2 - - async def test_failed_status_raises(self) -> None: - from agent_framework_foundry._foundry_evals import _poll_generation_job - - err = MagicMock(message="boom") - terminal = MagicMock(id="job_1", status="failed", error=err) - evaluators_ops = MagicMock() - evaluators_ops.get_generation_job = AsyncMock(return_value=terminal) - - with pytest.raises(RuntimeError, match="boom"): - await _poll_generation_job( - evaluators_ops, - MagicMock(id="job_1", status="running"), - poll_interval=0.001, - timeout=1.0, - ) - - async def test_timeout_raises(self) -> None: - from agent_framework_foundry._foundry_evals import _poll_generation_job - - running = MagicMock(id="job_1", status="running") - evaluators_ops = MagicMock() - evaluators_ops.get_generation_job = AsyncMock(return_value=running) - - with pytest.raises(TimeoutError): - await _poll_generation_job(evaluators_ops, running, poll_interval=0.001, timeout=0.005) - - -class TestGenerationJobToRef: - """Translation of a completed generation job to a GeneratedEvaluatorRef.""" - - def test_builds_pinned_ref_with_dimensions(self) -> None: - from agent_framework_foundry._foundry_evals import RubricDimension, _generation_job_to_ref - - dim = MagicMock(id="d1", description="dim", weight=2, 
always_applicable=True) - definition = MagicMock(dimensions=[dim], pass_threshold=0.75) - evaluator = MagicMock( - name="my-eval", - version=3, - display_name="My Eval", - description="A custom rubric.", - definition=definition, - ) - evaluator.name = "my-eval" - job = MagicMock(artifacts=MagicMock(evaluator=evaluator)) - - ref = _generation_job_to_ref(job, category="quality") - assert ref.name == "my-eval" - assert ref.version == "3" - assert ref.display_name == "My Eval" - assert ref.description == "A custom rubric." - assert ref.category == "quality" - assert ref.pass_threshold == 0.75 - assert ref.dimensions is not None - assert ref.dimensions[0] == RubricDimension(id="d1", description="dim", weight=2, always_applicable=True) - - def test_missing_artifacts_raises(self) -> None: - from agent_framework_foundry._foundry_evals import _generation_job_to_ref - - job = MagicMock(artifacts=None) - with pytest.raises(RuntimeError, match="evaluator artifact"): - _generation_job_to_ref(job, category="quality") - - -class TestGenerateRubricSdkMissing: - """generate_rubric raises NotImplementedError when SDK lacks the rubric APIs.""" - - async def test_raises_when_sdk_types_unavailable(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - from agent_framework_foundry._foundry_evals import EvalGenerationSource - - def _raise() -> Any: - raise fm._RubricSdkUnavailableError(fm._RUBRIC_SDK_MISSING_MSG) - - monkeypatch.setattr(fm, "_import_generation_sdk_types", _raise) - - project_client = MagicMock() - - with pytest.raises(NotImplementedError, match="rubric"): - await FoundryEvals.generate_rubric( - project_client=project_client, - name="my-eval", - sources=[EvalGenerationSource(type="prompt", prompt="hi")], - ) - - async def test_raises_value_error_on_invalid_category(self) -> None: - """category outside {quality, safety} should fail fast at the boundary.""" - from agent_framework_foundry._foundry_evals import 
EvalGenerationSource - - project_client = MagicMock() - - with pytest.raises(ValueError, match="category"): - await FoundryEvals.generate_rubric( - project_client=project_client, - name="my-eval", - sources=[EvalGenerationSource(type="prompt", prompt="hi")], - category=cast("Any", "invalid"), - ) - - -class TestGenerateRubricE2E: - """End-to-end happy path for generate_rubric with mocked SDK.""" - - async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - # Stub SDK type handles - prompt_cls = MagicMock(name="PromptSource") - prompt_cls.return_value = "sdk-prompt" - inputs_cls = MagicMock(name="EvaluatorGenerationInputs") - inputs_cls.return_value = "sdk-inputs" - job_cls = MagicMock(name="EvaluatorGenerationJob") - job_cls.return_value = "sdk-job" - - sdk_types = fm._GenerationSdkTypes( - EvaluatorGenerationInputs=inputs_cls, - EvaluatorGenerationJob=job_cls, - PromptSource=prompt_cls, - AgentSource=None, - DatasetSource=None, - TracesSource=None, - ) - monkeypatch.setattr(fm, "_import_generation_sdk_types", lambda: sdk_types) - - # Mock the SDK operations and completed job - completed_evaluator = MagicMock(version="7", display_name=None, description=None) - completed_evaluator.name = "agent-rubric" - completed_evaluator.definition = MagicMock(dimensions=[], pass_threshold=None) - completed = MagicMock( - id="job_42", - status="succeeded", - artifacts=MagicMock(evaluator=completed_evaluator), - ) - - evaluators_ops = MagicMock() - evaluators_ops.create_generation_job = AsyncMock(return_value=completed) - evaluators_ops.get_generation_job = AsyncMock(return_value=completed) - project_client = MagicMock() - project_client.beta = MagicMock(evaluators=evaluators_ops) - - # Build a stub agent - agent = _make_stub_agent( - name="weather-bot", - description="Looks up weather.", - instructions="Be brief.", - ) - - ref = await FoundryEvals.generate_rubric( - 
project_client=project_client, - name="agent-rubric", - agent=agent, - category="quality", - model="gpt-4o", - display_name="Display", - description="Desc", - operation_id="op-123", - ) - - assert ref.name == "agent-rubric" - assert ref.version == "7" - assert ref.category == "quality" - - # Verify inputs/job/source assembly - prompt_cls.assert_called_once() - prompt_kwargs = prompt_cls.call_args.kwargs - assert "Agent name: weather-bot" in prompt_kwargs["prompt"] - assert prompt_kwargs["description"] == "Looks up weather." - - inputs_cls.assert_called_once() - inputs_kwargs = inputs_cls.call_args.kwargs - assert inputs_kwargs["name"] == "agent-rubric" - assert inputs_kwargs["category"] == "quality" - assert inputs_kwargs["model"] == "gpt-4o" - assert inputs_kwargs["display_name"] == "Display" - assert inputs_kwargs["description"] == "Desc" - assert inputs_kwargs["sources"] == ["sdk-prompt"] - - job_cls.assert_called_once_with(inputs="sdk-inputs") - evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123") - - -# --------------------------------------------------------------------------- -# FoundryEvals.create_rubric_evaluator — manual rubric registration -# --------------------------------------------------------------------------- - - -class TestCreateRubricEvaluatorValidation: - """Argument validation for ``FoundryEvals.create_rubric_evaluator``.""" - - async def test_rejects_empty_dimensions(self) -> None: - with pytest.raises(ValueError, match="at least one dimension"): - await FoundryEvals.create_rubric_evaluator( - project_client=MagicMock(), - name="x", - dimensions=[], - ) - - async def test_rejects_invalid_category(self) -> None: - with pytest.raises(ValueError, match="category"): - await FoundryEvals.create_rubric_evaluator( - project_client=MagicMock(), - name="x", - dimensions=[RubricDimension(id="a", description="d", weight=5)], - category="bogus", # type: ignore[arg-type] - ) - - async def 
test_rejects_out_of_range_pass_threshold(self) -> None: - with pytest.raises(ValueError, match="pass_threshold"): - await FoundryEvals.create_rubric_evaluator( - project_client=MagicMock(), - name="x", - dimensions=[RubricDimension(id="a", description="d", weight=5)], - pass_threshold=1.5, - ) - - async def test_rejects_duplicate_dimension_ids(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - sdk = fm._ManualRubricSdkTypes( - EvaluatorVersion=MagicMock(), - RubricBasedEvaluatorDefinition=MagicMock(), - Dimension=MagicMock(), - ) - monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) - with pytest.raises(ValueError, match="Duplicate"): - await FoundryEvals.create_rubric_evaluator( - project_client=MagicMock(), - name="x", - dimensions=[ - RubricDimension(id="dup", description="d1", weight=5), - RubricDimension(id="dup", description="d2", weight=3), - ], - ) - - async def test_rejects_weight_out_of_range(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - sdk = fm._ManualRubricSdkTypes( - EvaluatorVersion=MagicMock(), - RubricBasedEvaluatorDefinition=MagicMock(), - Dimension=MagicMock(), - ) - monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) - with pytest.raises(ValueError, match="weight"): - await FoundryEvals.create_rubric_evaluator( - project_client=MagicMock(), - name="x", - dimensions=[RubricDimension(id="a", description="d", weight=0)], - ) - - async def test_rejects_empty_description(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - sdk = fm._ManualRubricSdkTypes( - EvaluatorVersion=MagicMock(), - RubricBasedEvaluatorDefinition=MagicMock(), - Dimension=MagicMock(), - ) - monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) - with pytest.raises(ValueError, match="description"): - await FoundryEvals.create_rubric_evaluator( 
- project_client=MagicMock(), - name="x", - dimensions=[RubricDimension(id="a", description="", weight=5)], - ) - - -class TestCreateRubricEvaluatorSdkMissing: - async def test_raises_not_implemented_when_sdk_lacks_types(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - def _raise() -> Any: - raise fm._RubricSdkUnavailableError("nope") - - monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", _raise) - with pytest.raises(NotImplementedError, match="nope"): - await FoundryEvals.create_rubric_evaluator( - project_client=MagicMock(), - name="x", - dimensions=[RubricDimension(id="a", description="d", weight=5)], - ) - - -class TestCreateRubricEvaluatorE2E: - """End-to-end happy path for create_rubric_evaluator with mocked SDK.""" - - async def test_calls_create_version_with_rubric_definition(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - dimension_cls = MagicMock(name="Dimension", side_effect=lambda **kw: ("dim", kw)) - definition_cls = MagicMock(name="RubricBasedEvaluatorDefinition", side_effect=lambda **kw: ("def", kw)) - version_cls = MagicMock(name="EvaluatorVersion", side_effect=lambda **kw: ("ver", kw)) - - sdk = fm._ManualRubricSdkTypes( - EvaluatorVersion=version_cls, - RubricBasedEvaluatorDefinition=definition_cls, - Dimension=dimension_cls, - ) - monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) - - created_definition = MagicMock() - created_definition.dimensions = [ - MagicMock(dimension_id="intent", description="d1", weight=9, always_applicable=False), - MagicMock(dimension_id="general_quality", description="g", weight=5, always_applicable=True), - ] - created_definition.pass_threshold = 0.7 - created_version = MagicMock( - display_name="DN", - description="hand-authored", - ) - created_version.name = "policy-eval" - created_version.version = "3" - created_version.definition = created_definition - - 
evaluators_ops = MagicMock() - evaluators_ops.create_version = AsyncMock(return_value=created_version) - project_client = MagicMock() - project_client.beta = MagicMock(evaluators=evaluators_ops) - - ref = await FoundryEvals.create_rubric_evaluator( - project_client=project_client, - name="policy-eval", - dimensions=[ - RubricDimension(id="intent", description="d1", weight=9), - RubricDimension(id="general_quality", description="g", weight=5, always_applicable=True), - ], - category="quality", - pass_threshold=0.7, - display_name="DN", - description="hand-authored", - tags={"team": "agents"}, - metadata={"source": "manual"}, - ) - - # Returned ref carries the persisted (name, version) and snapshot of dimensions. - assert ref.name == "policy-eval" - assert ref.version == "3" - assert ref.category == "quality" - assert ref.pass_threshold == 0.7 - assert ref.dimensions is not None - assert [d.id for d in ref.dimensions] == ["intent", "general_quality"] - assert ref.dimensions[1].always_applicable is True - - # Dimension construction used dimension_id, included always_applicable only when True. - assert dimension_cls.call_count == 2 - first_kwargs = dimension_cls.call_args_list[0].kwargs - assert first_kwargs == {"dimension_id": "intent", "description": "d1", "weight": 9} - second_kwargs = dimension_cls.call_args_list[1].kwargs - assert second_kwargs == { - "dimension_id": "general_quality", - "description": "g", - "weight": 5, - "always_applicable": True, - } - - # Definition construction forwarded pass_threshold and the two sdk dimensions. 
- definition_cls.assert_called_once() - def_kwargs = definition_cls.call_args.kwargs - assert def_kwargs["pass_threshold"] == 0.7 - assert def_kwargs["dimensions"] == [ - ("dim", {"dimension_id": "intent", "description": "d1", "weight": 9}), - ( - "dim", - { - "dimension_id": "general_quality", - "description": "g", - "weight": 5, - "always_applicable": True, - }, - ), - ] - - # EvaluatorVersion construction passed evaluator_type="custom", category list, and optionals. - version_cls.assert_called_once() - ver_kwargs = version_cls.call_args.kwargs - assert ver_kwargs["evaluator_type"] == "custom" - assert ver_kwargs["categories"] == ["quality"] - assert ver_kwargs["display_name"] == "DN" - assert ver_kwargs["description"] == "hand-authored" - assert ver_kwargs["tags"] == {"team": "agents"} - assert ver_kwargs["metadata"] == {"source": "manual"} - - # SDK ops invoked with name + evaluator_version kwarg. - evaluators_ops.create_version.assert_awaited_once() - call = evaluators_ops.create_version.await_args - assert call.args == ("policy-eval",) - assert "evaluator_version" in call.kwargs - - async def test_omits_pass_threshold_when_not_set(self, monkeypatch: pytest.MonkeyPatch) -> None: - from agent_framework_foundry import _foundry_evals as fm - - dimension_cls = MagicMock(side_effect=lambda **kw: kw) - definition_cls = MagicMock(side_effect=lambda **kw: kw) - version_cls = MagicMock(side_effect=lambda **kw: kw) - - sdk = fm._ManualRubricSdkTypes( - EvaluatorVersion=version_cls, - RubricBasedEvaluatorDefinition=definition_cls, - Dimension=dimension_cls, - ) - monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk) - - created = MagicMock(display_name=None, description=None) - created.name = "x" - created.version = "1" - created.definition = MagicMock(dimensions=[], pass_threshold=None) - - evaluators_ops = MagicMock() - evaluators_ops.create_version = AsyncMock(return_value=created) - project_client = MagicMock() - project_client.beta = 
MagicMock(evaluators=evaluators_ops) - - ref = await FoundryEvals.create_rubric_evaluator( - project_client=project_client, - name="x", - dimensions=[RubricDimension(id="a", description="d", weight=5)], - ) - assert ref.pass_threshold is None - assert "pass_threshold" not in definition_cls.call_args.kwargs diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md index b7f8f7cc1b6..4ef22f6ee66 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -35,25 +35,31 @@ Evaluate what already happened — zero changes to agent code: uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py ``` -### `evaluate_with_generated_rubric_sample.py` — Auto-Generate a Rubric - -Let Foundry draft the rubric dimensions for you from the agent's -context (instructions, tools, description). Best when you don't yet -have a fixed scoring rubric and want a strong baseline you can refine. - -```bash -uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py +### Referencing a rubric evaluator created in Foundry + +Foundry users can create rubric evaluators in the Foundry portal (or +through the dedicated SDK / REST surface) — see +[Rubric evaluators](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators) +for the authoring flow. Once an evaluator exists, agent-framework +consumes it like any other evaluator: pass a +`GeneratedEvaluatorRef(name=..., version=...)` in the `evaluators=` +list and pin the version for reproducible runs. 
+ +```python +from agent_framework.foundry import FoundryEvals, GeneratedEvaluatorRef + +evals = FoundryEvals( + evaluators=[ + GeneratedEvaluatorRef(name="reservation-policy-rubric", version="3"), + "relevance", + "coherence", + ], +) ``` -### `evaluate_with_manual_rubric_sample.py` — Author a Rubric Yourself - -Bring your own `RubricDimension`s (from a spec, a competing framework, -or hand tuning) and register them as a versioned evaluator. Use this -when you already know what you want to score. - -```bash -uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py -``` +Quality gates on rubric output use the standard `EvalResults` helpers, +including `assert_dimension_score_at_least(...)` for per-dimension +thresholds. ## Setup @@ -64,5 +70,4 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py` - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2 -- **"I want Foundry to draft a custom rubric for my agent"** → `evaluate_with_generated_rubric_sample.py` -- **"I already have a rubric I want to bring into Foundry"** → `evaluate_with_manual_rubric_sample.py` +- **"I want to score against a custom rubric I created in Foundry"** → pass a `GeneratedEvaluatorRef` (see snippet above) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py deleted file mode 100644 index 9c19ff552ba..00000000000 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -"""Generate a Foundry rubric evaluator from an agent and use it in CI. 
- -This sample demonstrates the end-to-end adaptive-evals flow: - -1. Build an agent. -2. Generate a rubric evaluator from the agent using - ``FoundryEvals.generate_rubric()`` — produces a pinned - ``GeneratedEvaluatorRef`` you can store in source control. -3. Use the pinned reference in ``evaluators=[...]`` for a regression - run alongside built-in evaluators. -4. Assert quality gates with ``assert_score_at_least`` / - ``assert_dimension_score_at_least`` / ``assert_no_failed_items``. - -A companion ``evaluators.yaml`` shows the source-controlled config -pattern for CI. Load it with :func:`load_evals_config` and pass the -resulting spec through :func:`build_sources` to keep generation -parameters out of code. - -Prerequisites: -- An Azure AI Foundry project with a deployed model. -- ``azure-ai-projects`` build that includes the rubric-generation APIs. -- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``. - -Run with: - -.. code-block:: bash - - az login - python evaluate_with_generated_rubric_sample.py -""" - -import asyncio -import os -import textwrap -from pathlib import Path - -from agent_framework import evaluate_agent -from agent_framework.foundry import ( - FoundryChatClient, - FoundryEvals, - build_sources, - load_evals_config, -) -from azure.ai.projects.aio import AIProjectClient -from azure.identity.aio import AzureCliCredential -from dotenv import load_dotenv - -load_dotenv() - - -def get_weather(location: str) -> str: - """Get the current weather for a location.""" - samples = { - "seattle": "62F, cloudy with a chance of rain", - "london": "55F, overcast", - "paris": "68F, partly sunny", - } - return samples.get(location.lower(), f"Weather data not available for {location}") - - -SAMPLE_YAML = textwrap.dedent( - """\ - evaluators: - travel-quality: - type: foundry.generated_rubric - category: quality - model: gpt-4o - display_name: Travel Quality Rubric - description: Custom rubric tailored to the travel-assistant agent. 
- sources: - - type: agent - include_instructions: true - include_tools: true - """ -) - - -async def main() -> None: - project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] - model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o") - - credential = AzureCliCredential() - chat_client = FoundryChatClient( - project_endpoint=project_endpoint, - model=model_name, - credential=credential, - ) - project_client = AIProjectClient(endpoint=project_endpoint, credential=credential) - - agent = chat_client.as_agent( - name="travel-assistant", - instructions=( - "You are a helpful travel assistant. Always ground recommendations in tool output, " - "cite each tool result, and refuse questions outside travel planning." - ), - tools=[get_weather], - ) - - # 1. Load the source-controlled evaluator config. - config_path = Path(__file__).with_name("evaluators.yaml") - if not config_path.exists(): - config_path.write_text(SAMPLE_YAML, encoding="utf-8") - print(f"Wrote sample config to {config_path}") - config = load_evals_config(config_path) - spec = config["travel-quality"] - - # 2. Generate (or refresh) the rubric evaluator. In CI you typically run - # this once and commit the returned name/version pair. - print("Generating rubric evaluator from agent + spec...") - sources = build_sources(spec, agent=agent) - rubric_ref = await FoundryEvals.generate_rubric( - project_client=project_client, - name=spec.name, - sources=sources, - category=spec.category, - model=spec.model, - display_name=spec.display_name, - description=spec.description, - ) - print(f"Generated rubric {rubric_ref.name}@{rubric_ref.version} with {len(rubric_ref.dimensions or ())} dimensions") - - # 3. Run an evaluation that combines built-ins with the new rubric. 
- evals = FoundryEvals( - client=chat_client, - evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref], - ) - results = await evaluate_agent( - agent=agent, - queries=[ - "What's the weather in Seattle?", - "Should I pack an umbrella for London?", - ], - evaluators=evals, - ) - - # 4. Quality gates — wire these into your CI job's exit status. - for r in results: - print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}") - r.assert_no_failed_items() - r.assert_score_at_least(0.8) - if rubric_ref.dimensions: - r.assert_dimension_score_at_least(rubric_ref.dimensions[0].id, 3) - - await project_client.close() - await credential.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py deleted file mode 100644 index e1fc86ef71c..00000000000 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -"""Register a hand-authored rubric evaluator and use it in CI. - -This sample demonstrates the *manual* counterpart to -``evaluate_with_generated_rubric_sample.py``: - -1. Build an agent. -2. Author the rubric dimensions yourself — useful when you have an - established scoring rubric (from a spec, a competing framework, or - prior hand tuning) that you want to bring into Foundry as-is. -3. Register the rubric with - :meth:`FoundryEvals.create_rubric_evaluator` — this maps directly to - ``project_client.beta.evaluators.create_version`` and returns a - pinned ``GeneratedEvaluatorRef`` you can store in source control. -4. Use the pinned reference in ``evaluators=[...]`` for a regression run - alongside built-in evaluators. 
- -The service auto-attaches a non-editable residual dimension -(``general_quality`` for ``category="quality"``, -``general_policy_compliance`` for ``"safety"``) — do not include it in -``dimensions``. - -Prefer :meth:`FoundryEvals.generate_rubric` if you want Foundry to -draft the dimensions for you from the agent's context. Use this manual -flow when you already know what you want to score. - -Prerequisites: -- An Azure AI Foundry project with a deployed model. -- ``azure-ai-projects`` build that includes the rubric APIs (currently - ``2.3.0a*`` on the Azure SDK Python dev feed). -- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``. - -Run with: - -.. code-block:: bash - - az login - python evaluate_with_manual_rubric_sample.py -""" - -import asyncio -import os - -from agent_framework import evaluate_agent -from agent_framework.foundry import ( - FoundryChatClient, - FoundryEvals, - RubricDimension, -) -from azure.ai.projects.aio import AIProjectClient -from azure.identity.aio import AzureCliCredential -from dotenv import load_dotenv - -load_dotenv() - - -def get_weather(location: str) -> str: - """Get the current weather for a location.""" - samples = { - "seattle": "62F, cloudy with a chance of rain", - "london": "55F, overcast", - "paris": "68F, partly sunny", - } - return samples.get(location.lower(), f"Weather data not available for {location}") - - -# Hand-authored rubric — this is the artifact you commit alongside the -# agent so the rubric and the behavior it scores evolve together. -# Weights are 1-10 (the generation pipeline biases one dimension to -# 8-10; manual edits aren't constrained by this heuristic). -TRAVEL_RUBRIC_DIMENSIONS: list[RubricDimension] = [ - RubricDimension( - id="tool_grounding", - description=( - "Grounds every weather claim in tool output. Does not invent values when " - "the tool returns no data, and does not paraphrase tool output in a way " - "that distorts the underlying values." 
- ), - weight=9, - ), - RubricDimension( - id="scope_adherence", - description=( - "Stays within travel-planning scope. Politely declines or redirects " - "questions about topics unrelated to travel (e.g. general trivia, " - "personal advice, coding questions)." - ), - weight=6, - ), - RubricDimension( - id="actionable_recommendation", - description=( - "Provides a clear, actionable recommendation grounded in the tool result " - "(e.g. 'Pack an umbrella' when rain is reported), not just a restatement " - "of the raw weather data." - ), - weight=4, - ), -] - - -async def main() -> None: - project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] - model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o") - - credential = AzureCliCredential() - chat_client = FoundryChatClient( - project_endpoint=project_endpoint, - model=model_name, - credential=credential, - ) - project_client = AIProjectClient(endpoint=project_endpoint, credential=credential) - - agent = chat_client.as_agent( - name="travel-assistant", - instructions=( - "You are a helpful travel assistant. Always ground recommendations in " - "tool output, cite each tool result, and refuse questions outside travel " - "planning." - ), - tools=[get_weather], - ) - - # 1. Register (or bump the version of) the hand-authored rubric. - # The service auto-attaches the non-editable `general_quality` - # residual dimension for quality rubrics. - print("Registering manual rubric evaluator...") - rubric_ref = await FoundryEvals.create_rubric_evaluator( - project_client=project_client, - name="travel-quality-manual", - dimensions=TRAVEL_RUBRIC_DIMENSIONS, - category="quality", - pass_threshold=0.6, - display_name="Travel Quality (Manual)", - description="Hand-authored rubric for the travel-assistant agent.", - ) - print( - f"Registered rubric {rubric_ref.name}@{rubric_ref.version} " - f"with {len(rubric_ref.dimensions or ())} dimensions " - f"(pass_threshold={rubric_ref.pass_threshold})" - ) - - # 2. 
Run an evaluation that combines built-ins with the new rubric. - evals = FoundryEvals( - client=chat_client, - evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref], - ) - results = await evaluate_agent( - agent=agent, - queries=[ - "What's the weather in Seattle?", - "Should I pack an umbrella for London?", - "What's the capital of France?", # off-scope — exercises scope_adherence - ], - evaluators=evals, - ) - - # 3. Quality gates — wire these into your CI job's exit status. - for r in results: - print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}") - r.assert_no_failed_items() - r.assert_score_at_least(0.7) - r.assert_dimension_score_at_least("tool_grounding", 3) - r.assert_dimension_score_at_least("scope_adherence", 3) - - await project_client.close() - await credential.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml deleted file mode 100644 index f3e698c77ce..00000000000 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml +++ /dev/null @@ -1,11 +0,0 @@ -evaluators: - travel-quality: - type: foundry.generated_rubric - category: quality - model: gpt-4o - display_name: Travel Quality Rubric - description: Custom rubric tailored to the travel-assistant agent. 
- sources: - - type: agent - include_instructions: true - include_tools: true From 907c9092b91a8465f7248bc955714e52e2673b0c Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Thu, 28 May 2026 10:16:32 -0700 Subject: [PATCH 13/16] samples(foundry-evals): add evaluate_with_rubric_sample Adds a runnable end-to-end sample showing how to consume a pre-existing rubric evaluator created in Foundry: reference it with GeneratedEvaluatorRef(name, version), mix it with built-in evaluators in FoundryEvals, and gate CI with assert_dimension_score_at_least on a specific dimension. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../evaluation/foundry_evals/.env.example | 9 ++ .../evaluation/foundry_evals/README.md | 6 +- .../evaluate_with_rubric_sample.py | 138 ++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example index b6a8af233e8..388350edea2 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example @@ -1,3 +1,12 @@ FOUNDRY_PROJECT_ENDPOINT="" FOUNDRY_MODEL="" +# Only needed for evaluate_with_rubric_sample.py — connects to the +# pre-existing Foundry agent that the rubric evaluator was created against. +FOUNDRY_AGENT_NAME="" +FOUNDRY_AGENT_VERSION="" + +# Only needed for evaluate_with_rubric_sample.py — references a rubric +# evaluator you created in Foundry. Pin the version for reproducible runs. 
+FOUNDRY_RUBRIC_NAME="" +FOUNDRY_RUBRIC_VERSION="" \ No newline at end of file diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md index 4ef22f6ee66..2f47c468612 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -61,6 +61,10 @@ Quality gates on rubric output use the standard `EvalResults` helpers, including `assert_dimension_score_at_least(...)` for per-dimension thresholds. +See [`evaluate_with_rubric_sample.py`](./evaluate_with_rubric_sample.py) +for a runnable end-to-end example that combines a rubric evaluator with +built-in evaluators and gates a per-dimension threshold. + ## Setup Create a `.env` file with configuration as in the `.env.example` file in this folder. @@ -70,4 +74,4 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py` - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2 -- **"I want to score against a custom rubric I created in Foundry"** → pass a `GeneratedEvaluatorRef` (see snippet above) +- **"I want to score against a custom rubric I created in Foundry"** → `evaluate_with_rubric_sample.py` diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py new file mode 100644 index 00000000000..06ec5c9bdd7 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py @@ -0,0 +1,138 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate a Foundry agent against a rubric evaluator that was created in Foundry. 
+ +Rubric evaluators are LLM-as-judge evaluators with custom scoring dimensions +that you define for your domain. agent-framework consumes pre-existing rubric +evaluators — they are authored in the Foundry portal (or via the dedicated +SDK / REST surface) and referenced here by name and version. + +See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators + +This sample demonstrates: +1. Connecting to a pre-existing Foundry agent (PromptAgent or HostedAgent). +2. Referencing a pre-existing rubric evaluator by ``name`` and ``version``. +3. Mixing the rubric with built-in Foundry evaluators in one run. +4. Asserting per-dimension thresholds with + ``EvalResults.assert_dimension_score_at_least(...)`` for CI quality gates. + +Starting condition / prerequisites: +- An Azure AI Foundry project with a deployed model. +- A registered Foundry agent (PromptAgent or HostedAgent) in that project. + This is the agent the rubric is meant to evaluate. +- A rubric evaluator already created in the Foundry portal against that + agent. Creating rubrics through the portal currently requires picking a + Foundry agent as the generation context, so this prerequisite is implied + by having a rubric at all. +- Set the following in .env (see ``.env.example``): + - ``FOUNDRY_PROJECT_ENDPOINT`` + - ``FOUNDRY_AGENT_NAME`` and ``FOUNDRY_AGENT_VERSION`` for the agent + - ``FOUNDRY_RUBRIC_NAME`` and ``FOUNDRY_RUBRIC_VERSION`` for the rubric + - ``FOUNDRY_MODEL`` for the rubric judge model +""" + +import asyncio +import os + +from agent_framework import EvalNotPassedError, evaluate_agent +from agent_framework.foundry import FoundryAgent, FoundryChatClient, FoundryEvals, GeneratedEvaluatorRef +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv(override=True) + + +async def main() -> None: + # 1. Connect to the existing Foundry agent that the rubric was created + # against. 
PromptAgents and HostedAgents are both supported. + credential = AzureCliCredential() + project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] + + agent = FoundryAgent( + project_endpoint=project_endpoint, + agent_name=os.environ["FOUNDRY_AGENT_NAME"], + agent_version=os.environ.get("FOUNDRY_AGENT_VERSION"), + credential=credential, + ) + + # 2. Reference the pre-existing rubric evaluator by name + version. + # Always pin a version for reproducible CI runs; versionless refs + # resolve to "latest" and emit a warning at evaluation time. + rubric_name = os.environ["FOUNDRY_RUBRIC_NAME"] + rubric_version = os.environ["FOUNDRY_RUBRIC_VERSION"] + rubric = GeneratedEvaluatorRef(name=rubric_name, version=rubric_version) + + # 3. Mix the rubric with built-in evaluators in a single FoundryEvals + # config. FoundryEvals talks to Foundry over the project endpoint, so + # we hand it a FoundryChatClient configured with the same credential. + eval_client = FoundryChatClient( + project_endpoint=project_endpoint, + model=os.environ["FOUNDRY_MODEL"], + credential=credential, + ) + evals = FoundryEvals( + client=eval_client, + evaluators=[ + rubric, + FoundryEvals.RELEVANCE, + FoundryEvals.COHERENCE, + ], + ) + + # ========================================================================= + # Run evaluation + # ========================================================================= + print("=" * 60) + print(f"Evaluating '{agent.name}' with rubric '{rubric_name}' (version {rubric_version})") + print("=" * 60) + + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "Should I bring an umbrella to London tomorrow?", + ], + evaluators=evals, + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("[PASS] All passed") + else: + print(f"[FAIL] {r.failed} failed") + + # 
========================================================================= + # Per-dimension quality gate + # ========================================================================= + # Rubric evaluators emit per-dimension scores (1–5) on top of the overall + # weighted score. Use assert_dimension_score_at_least to gate CI on a + # specific dimension — e.g., never ship if a critical dimension drops + # below 3. + # + # The dimension_id must match an id defined on your rubric in Foundry. + # ``general_quality`` is used here because it's the conventional + # ``always_applicable: true`` dimension in the Foundry docs' example + # rubric — swap it for whatever dimension id(s) your rubric actually + # defines. + print() + print("=" * 60) + print("Per-dimension quality gate") + print("=" * 60) + + for r in results: + try: + r.assert_dimension_score_at_least( + "general_quality", + min_score=3.0, + evaluator=rubric_name, + ) + print(f"[PASS] {r.provider}: general_quality >= 3 on every item") + except EvalNotPassedError as exc: + print(f"[FAIL] {r.provider}: dimension gate tripped: {exc}") + + +if __name__ == "__main__": + asyncio.run(main()) From b6a558d6e2aa8e0bfd4a722cc991552574bb45a4 Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Fri, 29 May 2026 08:34:46 -0700 Subject: [PATCH 14/16] fix(foundry-evals): satisfy mypy on _fetch_output_items mypy infers OutputItemListResponse.sample as dict[str, object] | None while pyright correctly infers the typed Sample model. Cast to Any so both type checkers accept the attribute access pattern, rename the local to avoid shadowing the inner-loop sample binding, and drop the now-stale pyright suppressions. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_foundry/_foundry_evals.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py index f242db06d91..8059c2ce990 100644 --- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -598,15 +598,18 @@ async def _fetch_output_items( output_text: str | None = None response_id: str | None = None - sample = oi.sample - if sample is not None: # pyright: ignore[reportUnnecessaryComparison] - err = sample.error - if err is not None and (err.code or err.message): # pyright: ignore[reportUnnecessaryComparison] + # mypy infers oi.sample as dict[str, object] | None, but the + # OpenAI SDK actually returns a typed Sample model. Annotate as Any so + # both type checkers accept the attribute access pattern. 
+ oi_sample: Any = oi.sample + if oi_sample is not None: + err = oi_sample.error + if err is not None and (err.code or err.message): error_code = err.code or None error_message = err.message or None - usage = sample.usage - if usage is not None and usage.total_tokens: # pyright: ignore[reportUnnecessaryComparison] + usage = oi_sample.usage + if usage is not None and usage.total_tokens: token_usage = { "prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens, @@ -615,13 +618,13 @@ async def _fetch_output_items( } # Extract input/output text - if sample.input: - parts = [si.content for si in sample.input if si.role == "user"] + if oi_sample.input: + parts = [si.content for si in oi_sample.input if si.role == "user"] if parts: input_text = " ".join(parts) - if sample.output: - parts = [so.content or "" for so in sample.output if so.role == "assistant"] + if oi_sample.output: + parts = [so.content or "" for so in oi_sample.output if so.role == "assistant"] if parts: output_text = " ".join(parts) From 93cf732b48a6c6a24013564014f2ecaae66d22af Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Fri, 29 May 2026 08:39:36 -0700 Subject: [PATCH 15/16] docs(foundry-evals): drop unpublished rubric-evaluators learn.microsoft.com link The Adaptive Evals authoring docs are not yet published on Microsoft Learn, so the link 404s. Keep the descriptive text without the broken hyperlink; we can re-add it once the docs ship. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../05-end-to-end/evaluation/foundry_evals/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md index 2f47c468612..e30ce6aa464 100644 --- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -38,10 +38,8 @@ uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py ### Referencing a rubric evaluator created in Foundry Foundry users can create rubric evaluators in the Foundry portal (or -through the dedicated SDK / REST surface) — see -[Rubric evaluators](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators) -for the authoring flow. Once an evaluator exists, agent-framework -consumes it like any other evaluator: pass a +through the dedicated SDK / REST surface). Once an evaluator exists, +agent-framework consumes it like any other evaluator: pass a `GeneratedEvaluatorRef(name=..., version=...)` in the `evaluators=` list and pin the version for reproducible runs. From fb3eb7fe7d9bc34d255fbb4cc66b9cfa763b2f28 Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:43:51 -0700 Subject: [PATCH 16/16] test(foundry-evals): hoist repeated local imports to module top Per code review feedback (eavanvalkenburg): the test file repeated 'from agent_framework_foundry._foundry_evals import ...' inside 22 test bodies and 'from agent_framework_foundry import GeneratedEvaluatorRef' inside 8 more. Move all of them to the existing top-level imports; the symbols are the same across tests and the local imports were redundant. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../foundry/tests/test_foundry_evals.py | 39 +++++-------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py index df8627352bb..8734650aafb 100644 --- a/python/packages/foundry/tests/test_foundry_evals.py +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -25,16 +25,25 @@ from agent_framework._workflows._workflow import WorkflowRunResult from openai import AsyncOpenAI +from agent_framework_foundry import GeneratedEvaluatorRef from agent_framework_foundry._foundry_evals import ( + _AGENT_EVALUATORS, + _BUILTIN_EVALUATORS, + _TOOL_EVALUATORS, FoundryEvals, _build_item_schema, _build_testing_criteria, _extract_per_evaluator, _extract_result_counts, + _extract_rubric_scores, + _fetch_output_items, _filter_tool_evaluators, + _poll_eval_run, _resolve_default_evaluators, _resolve_evaluator, _resolve_openai_client, + evaluate_foundry_target, + evaluate_traces, ) @@ -807,7 +816,6 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None: assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions" def test_generated_evaluator_ref_pinned_version(self) -> None: - from agent_framework_foundry import GeneratedEvaluatorRef ref = GeneratedEvaluatorRef(name="my-rubric", version="1") criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True) @@ -825,7 +833,6 @@ def test_generated_evaluator_ref_pinned_version(self) -> None: } def test_generated_evaluator_ref_display_name_used_as_short(self) -> None: - from agent_framework_foundry import GeneratedEvaluatorRef ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric") criteria = _build_testing_criteria([ref], "gpt-4o") @@ -834,7 +841,6 @@ def test_generated_evaluator_ref_display_name_used_as_short(self) -> None: assert 
criteria[0]["evaluator_name"] == "my-rubric" def test_generated_evaluator_ref_tool_definitions_added(self) -> None: - from agent_framework_foundry import GeneratedEvaluatorRef ref = GeneratedEvaluatorRef(name="my-rubric", version="1") criteria = _build_testing_criteria( @@ -849,8 +855,6 @@ def test_generated_evaluator_ref_tool_definitions_added(self) -> None: def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None: import logging - from agent_framework_foundry import GeneratedEvaluatorRef - ref = GeneratedEvaluatorRef.latest("my-rubric") with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"): criteria = _build_testing_criteria([ref], "gpt-4o") @@ -859,7 +863,6 @@ def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureF assert any("no pinned version" in r.message for r in caplog.records) def test_generated_evaluator_ref_mixed_with_builtins(self) -> None: - from agent_framework_foundry import GeneratedEvaluatorRef ref = GeneratedEvaluatorRef(name="my-rubric", version="1") criteria = _build_testing_criteria( @@ -1331,7 +1334,6 @@ def test_raises_when_all_filtered(self) -> None: ) def test_preserves_generated_ref_when_no_tools(self) -> None: - from agent_framework_foundry import GeneratedEvaluatorRef ref = GeneratedEvaluatorRef(name="rubric", version="1") items = [ @@ -1346,7 +1348,6 @@ def test_preserves_generated_ref_when_no_tools(self) -> None: assert "tool_call_accuracy" not in result def test_generated_ref_alone_does_not_raise(self) -> None: - from agent_framework_foundry import GeneratedEvaluatorRef ref = GeneratedEvaluatorRef(name="rubric", version="1") items = [ @@ -2359,7 +2360,6 @@ def test_raise_for_status_includes_errored_items(self) -> None: class TestFetchOutputItems: async def test_fetches_and_converts_output_items(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items # Build mock output items matching the OpenAI SDK schema 
mock_result = MagicMock() @@ -2421,7 +2421,6 @@ async def test_fetches_and_converts_output_items(self) -> None: assert item.error_code is None async def test_handles_errored_item(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items mock_error = MagicMock() mock_error.code = "QueryExtractionError" @@ -2453,7 +2452,6 @@ async def test_handles_errored_item(self) -> None: assert len(item.scores) == 0 async def test_handles_api_failure_gracefully(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items mock_client = MagicMock() mock_client.evals.runs.output_items.list = AsyncMock(side_effect=TypeError("API error")) @@ -2462,7 +2460,6 @@ async def test_handles_api_failure_gracefully(self) -> None: assert items == [] async def test_extracts_rubric_scores_from_dict_sample(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items mock_result = MagicMock() mock_result.name = "my-rubric" @@ -2504,7 +2501,6 @@ async def test_extracts_rubric_scores_from_dict_sample(self) -> None: assert safety.applicable is False async def test_no_rubric_scores_when_absent(self) -> None: - from agent_framework_foundry._foundry_evals import _fetch_output_items mock_result = MagicMock() mock_result.name = "relevance" @@ -2529,7 +2525,6 @@ async def test_no_rubric_scores_when_absent(self) -> None: class TestExtractRubricScores: def test_handles_attribute_style_properties(self) -> None: - from agent_framework_foundry._foundry_evals import _extract_rubric_scores rs = MagicMock() rs.id = "policy" @@ -2549,7 +2544,6 @@ def test_handles_attribute_style_properties(self) -> None: assert result[0].weight == 2 def test_top_level_rubric_scores_in_dict(self) -> None: - from agent_framework_foundry._foundry_evals import _extract_rubric_scores sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]} result = _extract_rubric_scores(sample) @@ -2557,14 +2551,12 @@ def 
test_top_level_rubric_scores_in_dict(self) -> None: assert result[0].id == "a" def test_returns_none_when_missing(self) -> None: - from agent_framework_foundry._foundry_evals import _extract_rubric_scores assert _extract_rubric_scores(None) is None assert _extract_rubric_scores({}) is None assert _extract_rubric_scores({"properties": {}}) is None def test_skips_malformed_entries(self) -> None: - from agent_framework_foundry._foundry_evals import _extract_rubric_scores sample = { "properties": { @@ -2581,7 +2573,6 @@ def test_skips_malformed_entries(self) -> None: def test_canonical_dimension_scores_key_from_docs(self) -> None: """Per the Microsoft Learn docs, runtime output uses ``properties.dimension_scores``.""" - from agent_framework_foundry._foundry_evals import _extract_rubric_scores sample = { "properties": { @@ -2611,7 +2602,6 @@ def test_canonical_dimension_scores_key_from_docs(self) -> None: def test_dimension_scores_via_attribute(self) -> None: """Canonical key also resolves when properties exposes ``dimension_scores`` as an attr.""" - from agent_framework_foundry._foundry_evals import _extract_rubric_scores rs = MagicMock() rs.id = "policy_enforcement" @@ -2638,7 +2628,6 @@ def test_dimension_scores_via_attribute(self) -> None: class TestPollEvalRun: async def test_timeout_returns_timeout_status(self) -> None: """Poll timeout returns EvalResults with status='timeout'.""" - from agent_framework_foundry._foundry_evals import _poll_eval_run mock_client = MagicMock() mock_pending = MagicMock() @@ -2652,7 +2641,6 @@ async def test_timeout_returns_timeout_status(self) -> None: async def test_failed_run_returns_error(self) -> None: """Failed run returns EvalResults with error message.""" - from agent_framework_foundry._foundry_evals import _poll_eval_run mock_client = MagicMock() mock_failed = MagicMock() @@ -2670,7 +2658,6 @@ async def test_failed_run_returns_error(self) -> None: async def test_canceled_run_returns_canceled_status(self) -> None: """Canceled 
run returns EvalResults with status='canceled'.""" - from agent_framework_foundry._foundry_evals import _poll_eval_run mock_client = MagicMock() mock_canceled = MagicMock() @@ -2695,7 +2682,6 @@ async def test_canceled_run_returns_canceled_status(self) -> None: class TestEvaluateTraces: async def test_raises_without_required_args(self) -> None: """Raises ValueError when no response_ids, trace_ids, or agent_id given.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() with pytest.raises(ValueError, match="Provide at least one of"): @@ -2706,7 +2692,6 @@ async def test_raises_without_required_args(self) -> None: async def test_response_ids_path(self) -> None: """evaluate_traces with response_ids uses the responses API path.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() @@ -2754,7 +2739,6 @@ async def test_response_ids_path(self) -> None: async def test_trace_ids_path(self) -> None: """evaluate_traces with trace_ids builds azure_ai_traces data source.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() @@ -2794,7 +2778,6 @@ async def test_trace_ids_path(self) -> None: class TestEvaluateFoundryTarget: async def test_happy_path(self) -> None: """evaluate_foundry_target creates eval + run and polls to completion.""" - from agent_framework_foundry._foundry_evals import evaluate_foundry_target mock_client = MagicMock() @@ -2930,13 +2913,11 @@ class TestEvaluatorSetConsistency: """Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS.""" def test_agent_evaluators_subset(self): - from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS diff = _AGENT_EVALUATORS - set(_BUILTIN_EVALUATORS.values()) assert not diff, f"_AGENT_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}" def test_tool_evaluators_subset(self): - from agent_framework_foundry._foundry_evals import 
_BUILTIN_EVALUATORS, _TOOL_EVALUATORS diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values()) assert not diff, f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}" @@ -2950,7 +2931,6 @@ def test_tool_evaluators_subset(self): class TestEvaluateTracesAgentId: async def test_agent_id_only_path(self) -> None: """evaluate_traces with agent_id only builds azure_ai_traces data source.""" - from agent_framework_foundry._foundry_evals import evaluate_traces mock_client = MagicMock() @@ -3008,7 +2988,6 @@ def test_all_tool_evaluators_no_tools_raises(self): class TestEvaluateFoundryTargetValidation: async def test_target_without_type_raises(self) -> None: """target dict without 'type' key raises ValueError.""" - from agent_framework_foundry._foundry_evals import evaluate_foundry_target mock_client = MagicMock() with pytest.raises(ValueError, match="'type' key"):