From e45b934cc219cbbf0b452d27d10815ac480cf917 Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Tue, 26 May 2026 17:24:02 -0700
Subject: [PATCH 01/16] Python: feat(evals): RubricScore type +
 EvalScoreResult.dimensions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../packages/core/agent_framework/__init__.py |  2 ++
 .../core/agent_framework/_evaluation.py       | 34 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py
index 356051da3ff..52368df476b 100644
--- a/python/packages/core/agent_framework/__init__.py
+++ b/python/packages/core/agent_framework/__init__.py
@@ -70,6 +70,7 @@
     Evaluator,
     ExpectedToolCall,
     LocalEvaluator,
+    RubricScore,
     evaluate_agent,
     evaluate_workflow,
     evaluator,
@@ -425,6 +426,7 @@
     "ResponseStream",
     "Role",
     "RoleLiteral",
+    "RubricScore",
     "RunContext",
     "Runner",
     "RunnerContext",
diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 64fab0eacb6..32ae5bcfba4 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -311,12 +311,15 @@ class EvalScoreResult:
         score: Numeric score from the evaluator.
         passed: Whether the item passed this evaluator's threshold.
         sample: Optional raw evaluator output (rationale, metadata).
+        dimensions: Per-dimension scores for rubric-based evaluators.
+            ``None`` for non-rubric (e.g. built-in) evaluators.
     """
 
     name: str
     score: float
     passed: bool | None = None
     sample: dict[str, Any] | None = None
+    dimensions: list[RubricScore] | None = None
 
 
 @experimental(feature_id=ExperimentalFeature.EVALS)
@@ -496,6 +499,37 @@ def raise_for_status(self, msg: str | None = None) -> None:
                     detail += f" Errored items: {', '.join(summaries)}."
             raise EvalNotPassedError(detail)
 
+# endregion
+
+# region Generated rubric evaluators
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricScore:
+    """A single dimension's score from a rubric-based evaluator run.
+
+    Rubric evaluators (e.g. Foundry's generated rubric evaluators) emit
+    one ``RubricScore`` per dimension per item.  Attached to
+    :class:`EvalScoreResult` as a typed view of the raw
+    ``properties.rubric_scores`` payload.
+
+    Attributes:
+        id: Stable identifier for the dimension (e.g.
+            ``"policy_enforcement"``) defined by the rubric.
+        score: Numeric score, or ``None`` when the dimension was marked
+            non-applicable for this item.
+        applicable: Whether the dimension applied to this item.
+        weight: Dimension weight (mirrors the rubric definition).
+        reason: Short rationale produced by the evaluator.
+    """
+
+    id: str
+    score: int | None
+    applicable: bool
+    weight: int
+    reason: str
+
 
 # endregion
 

From e5830dd7fcdcabfb75ce2ca99d47fdb40fa9ef61 Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Tue, 26 May 2026 17:31:35 -0700
Subject: [PATCH 02/16] Python: feat(foundry-evals): RubricDimension +
 GeneratedEvaluatorRef + accept in evaluators=

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/__init__.py       |   4 +
 .../agent_framework_foundry/_foundry_evals.py | 192 ++++++++++++++++--
 .../foundry/tests/test_foundry_evals.py       | 104 ++++++++++
 python/uv.lock                                |   2 +-
 4 files changed, 280 insertions(+), 22 deletions(-)

diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py
index 002e63f8a6b..14eebfaffa0 100644
--- a/python/packages/foundry/agent_framework_foundry/__init__.py
+++ b/python/packages/foundry/agent_framework_foundry/__init__.py
@@ -12,6 +12,8 @@
 )
 from ._foundry_evals import (
     FoundryEvals,
+    GeneratedEvaluatorRef,
+    RubricDimension,
     evaluate_foundry_target,
     evaluate_traces,
 )
@@ -32,10 +34,12 @@
     "FoundryEmbeddingSettings",
     "FoundryEvals",
     "FoundryMemoryProvider",
+    "GeneratedEvaluatorRef",
     "RawFoundryAgent",
     "RawFoundryAgentChatClient",
     "RawFoundryChatClient",
     "RawFoundryEmbeddingClient",
+    "RubricDimension",
     "__version__",
     "evaluate_foundry_target",
     "evaluate_traces",
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index eef58b0a040..9cfcc4bc678 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -29,7 +29,8 @@
 import asyncio
 import logging
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal
 
 from agent_framework._evaluation import (
     AgentEvalConverter,
@@ -51,6 +52,107 @@
 
 logger = logging.getLogger(__name__)
 
+
+# region Generated rubric evaluator types
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricDimension:
+    """A single dimension of a Foundry generated rubric evaluator.
+
+    Rubric evaluators score each item along one or more named dimensions,
+    each with its own description and weight.  Foundry's evaluator
+    generation pipeline produces these dimensions from agent/workflow
+    metadata; agent-framework surfaces them so callers can inspect a
+    generated evaluator's structure without round-tripping through the
+    portal.
+
+    Attributes:
+        id: Stable identifier for the dimension (e.g. ``"policy_enforcement"``).
+        description: Natural-language description of what the dimension scores.
+        weight: Integer weight controlling the dimension's contribution to
+            the aggregate score.
+        always_applicable: When ``False``, evaluators may mark this
+            dimension non-applicable on a per-item basis.
+    """
+
+    id: str
+    description: str
+    weight: int
+    always_applicable: bool = False
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class GeneratedEvaluatorRef:
+    """A reference to a generated rubric evaluator stored in Foundry.
+
+    Pass instances of this class to :class:`FoundryEvals` to score items
+    with a previously generated rubric evaluator.  Construct directly
+    when the evaluator already exists, or obtain one from
+    :meth:`FoundryEvals.generate_rubric`.
+
+    ``version`` is optional, but pin it so an evaluation run is
+    reproducible.  Use :meth:`latest` to opt in to versionless references
+    explicitly.
+
+    Attributes:
+        name: Evaluator name as stored in the Foundry project (e.g.
+            ``"my-policy-evaluator"``).  Distinct from built-in
+            evaluators such as ``"builtin.relevance"``.
+        version: Pinned evaluator version.  ``None`` means "latest" —
+            this is discouraged for CI/repro and ``FoundryEvals`` will
+            emit a warning when used.
+        category: ``"quality"`` for ungrounded rubric scoring,
+            ``"safety"`` for safety-focused evaluators.  Matches the
+            Foundry evaluator's declared category.
+        display_name: Optional human-readable name used in result
+            summaries.  Defaults to ``name`` when unset.
+        description: Optional description carried over from the
+            generated evaluator definition for documentation.
+        dimensions: Optional snapshot of the rubric's dimensions for
+            inspection.  Not required to invoke the evaluator — the
+            service uses the persisted definition.
+        pass_threshold: Optional aggregate score threshold (0.0-1.0) that
+            an item must meet to be considered passing.  ``None`` defers to
+            the evaluator's stored default.
+    """
+
+    name: str
+    version: str | None = None
+    category: Literal["quality", "safety"] = "quality"
+    display_name: str | None = None
+    description: str | None = None
+    dimensions: tuple[RubricDimension, ...] | None = None
+    pass_threshold: float | None = None
+
+    @classmethod
+    def latest(
+        cls,
+        name: str,
+        *,
+        category: Literal["quality", "safety"] = "quality",
+        display_name: str | None = None,
+        description: str | None = None,
+    ) -> GeneratedEvaluatorRef:
+        """Construct a versionless reference (resolves to the latest version at run time).
+
+        Discouraged for reproducible runs.  Prefer the constructor with
+        an explicit ``version`` so CI and replay evaluations stay stable
+        when the evaluator is regenerated.
+        """
+        return cls(
+            name=name,
+            version=None,
+            category=category,
+            display_name=display_name,
+            description=description,
+        )
+
+
+# endregion
+
 # Agent evaluators that accept query/response as conversation arrays.
 # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
 # for the latest evaluator list. These are the evaluators that need conversation-format input.
@@ -166,7 +268,7 @@ def _resolve_evaluator(name: str) -> str:
 
 
 def _build_testing_criteria(
-    evaluators: Sequence[str],
+    evaluators: Sequence[str | GeneratedEvaluatorRef],
     model: str,
     *,
     include_data_mapping: bool = False,
@@ -175,7 +277,9 @@ def _build_testing_criteria(
     """Build ``testing_criteria`` for ``evals.create()``.
 
     Args:
-        evaluators: Evaluator names.
+        evaluators: Evaluator names (built-in shorts / fully-qualified
+            ``builtin.*`` names) or :class:`GeneratedEvaluatorRef`
+            instances for generated rubric evaluators.
         model: Model deployment for the LLM judge.
         include_data_mapping: Whether to include field-level data mapping
             (required for the JSONL data source, not needed for response-based).
@@ -183,7 +287,36 @@ def _build_testing_criteria(
             definitions.
     """
     criteria: list[dict[str, Any]] = []
-    for name in evaluators:
+    for entry_spec in evaluators:
+        if isinstance(entry_spec, GeneratedEvaluatorRef):
+            short = entry_spec.display_name or entry_spec.name
+            ref_entry: dict[str, Any] = {
+                "type": "azure_ai_evaluator",
+                "name": short,
+                "evaluator_name": entry_spec.name,
+                "initialization_parameters": {"deployment_name": model},
+            }
+            if entry_spec.version is not None:
+                ref_entry["evaluator_version"] = entry_spec.version
+            else:
+                logger.warning(
+                    "GeneratedEvaluatorRef '%s' has no pinned version; the eval run "
+                    "will resolve to whichever version is current at execution time. "
+                    "Pin the version for reproducible runs.",
+                    entry_spec.name,
+                )
+            if include_data_mapping:
+                ref_mapping: dict[str, str] = {
+                    "query": "{{item.query_messages}}",
+                    "response": "{{item.response_messages}}",
+                }
+                if include_tool_definitions:
+                    ref_mapping["tool_definitions"] = "{{item.tool_definitions}}"
+                ref_entry["data_mapping"] = ref_mapping
+            criteria.append(ref_entry)
+            continue
+
+        name = entry_spec
         qualified = _resolve_evaluator(name)
         short = name if not name.startswith("builtin.") else name.split(".")[-1]
 
@@ -247,9 +380,9 @@ def _build_item_schema(
 
 
 def _resolve_default_evaluators(
-    evaluators: Sequence[str] | None,
+    evaluators: Sequence[str | GeneratedEvaluatorRef] | None,
     items: Sequence[EvalItem | dict[str, Any]] | None = None,
-) -> list[str]:
+) -> list[str | GeneratedEvaluatorRef]:
     """Resolve evaluators, applying defaults when ``None``.
 
     Defaults to relevance + coherence + task_adherence. Automatically adds
@@ -258,7 +391,7 @@ def _resolve_default_evaluators(
     if evaluators is not None:
         return list(evaluators)
 
-    result = list(_DEFAULT_EVALUATORS)
+    result: list[str | GeneratedEvaluatorRef] = list(_DEFAULT_EVALUATORS)
     if items is not None:
         has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items)
         if has_tools:
@@ -267,14 +400,24 @@ def _resolve_default_evaluators(
 
 
 def _filter_tool_evaluators(
-    evaluators: list[str],
+    evaluators: list[str | GeneratedEvaluatorRef],
     items: Sequence[EvalItem | dict[str, Any]],
-) -> list[str]:
-    """Remove tool evaluators if no items have tool definitions."""
+) -> list[str | GeneratedEvaluatorRef]:
+    """Remove tool evaluators if no items have tool definitions.
+
+    Generated rubric evaluators are tool-aware but not tool-required; they
+    are preserved regardless of whether items carry tool definitions.
+    """
     has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items)
     if has_tools:
         return evaluators
-    filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS]
+
+    def _is_tool_only(spec: str | GeneratedEvaluatorRef) -> bool:
+        if isinstance(spec, GeneratedEvaluatorRef):
+            return False
+        return _resolve_evaluator(spec) in _TOOL_EVALUATORS
+
+    filtered = [e for e in evaluators if not _is_tool_only(e)]
     if not filtered:
         raise ValueError(
             f"All requested evaluators {evaluators} require tool definitions, "
@@ -282,7 +425,7 @@ def _filter_tool_evaluators(
             "or choose evaluators that do not require tools."
         )
     if len(filtered) < len(evaluators):
-        removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS]
+        removed = [e for e in evaluators if _is_tool_only(e)]
         logger.info("Removed tool evaluators %s (no items have tools)", removed)
     return filtered
 
@@ -472,7 +615,7 @@ async def _evaluate_via_responses_impl(
     *,
     client: AsyncOpenAI,
     response_ids: Sequence[str],
-    evaluators: list[str],
+    evaluators: list[str | GeneratedEvaluatorRef],
     model: str,
     eval_name: str,
     poll_interval: float,
@@ -573,8 +716,11 @@ class FoundryEvals:
             (from ``azure.ai.projects.aio``).  Provide this or *client*.
         model: Model deployment name for the evaluator LLM judge.
             Resolved from ``client.model`` when omitted.
-        evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``).
-            When ``None`` (default), uses smart defaults based on item data.
+        evaluators: Evaluator specifications.  Entries may be built-in
+            short names (e.g. ``"relevance"``), fully-qualified
+            ``"builtin.*"`` names, or :class:`GeneratedEvaluatorRef`
+            instances for previously generated rubric evaluators.  When
+            ``None`` (default), uses smart defaults based on item data.
         conversation_split: How to split multi-turn conversations into
             query/response halves.  Defaults to ``LAST_TURN``.  Pass a
             ``ConversationSplit`` enum value or a custom callable — see
@@ -623,7 +769,7 @@ def __init__(
         client: FoundryChatClient | None = None,
         project_client: AIProjectClient | None = None,
         model: str | None = None,
-        evaluators: Sequence[str] | None = None,
+        evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
         conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN,
         poll_interval: float = 5.0,
         timeout: float = 180.0,
@@ -642,7 +788,9 @@ def __init__(
                 "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured."
             )
         self._model = resolved_model
-        self._evaluators = list(evaluators) if evaluators is not None else None
+        self._evaluators: list[str | GeneratedEvaluatorRef] | None = (
+            list(evaluators) if evaluators is not None else None
+        )
         self._conversation_split = conversation_split
         self._poll_interval = poll_interval
         self._timeout = timeout
@@ -678,7 +826,7 @@ async def evaluate(
     async def _evaluate_via_dataset(
         self,
         items: Sequence[EvalItem],
-        evaluators: list[str],
+        evaluators: list[str | GeneratedEvaluatorRef],
         eval_name: str,
     ) -> EvalResults:
         """Evaluate using JSONL dataset upload path."""
@@ -761,7 +909,7 @@ async def _evaluate_via_dataset(
 @experimental(feature_id=ExperimentalFeature.EVALS)
 async def evaluate_traces(
     *,
-    evaluators: Sequence[str] | None = None,
+    evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
     client: FoundryChatClient | None = None,
     project_client: AIProjectClient | None = None,
     model: str,
@@ -854,7 +1002,7 @@ async def evaluate_foundry_target(
     *,
     target: dict[str, Any],
     test_queries: Sequence[str],
-    evaluators: Sequence[str] | None = None,
+    evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
     client: FoundryChatClient | None = None,
     project_client: AIProjectClient | None = None,
     model: str,
@@ -870,7 +1018,9 @@ async def evaluate_foundry_target(
     Args:
         target: Target configuration dict.
         test_queries: Queries for Foundry to send to the target.
-        evaluators: Evaluator names.
+        evaluators: Evaluator names (built-in shorts / fully-qualified
+            ``builtin.*`` names) or :class:`GeneratedEvaluatorRef`
+            instances for generated rubric evaluators.
         client: A ``FoundryChatClient`` instance. Provide this or *project_client*.
         project_client: An ``AIProjectClient`` instance.
         model: Model deployment name for the evaluator LLM judge.
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index a5d9f2e8642..f1ba5c86153 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -807,6 +807,79 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None:
             assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
 
 
+# ---------------------------------------------------------------------------
+# _build_testing_criteria: GeneratedEvaluatorRef entries
+# ---------------------------------------------------------------------------
+
+
+    def test_generated_evaluator_ref_pinned_version(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True)
+
+        assert len(criteria) == 1
+        c = criteria[0]
+        assert c["type"] == "azure_ai_evaluator"
+        assert c["evaluator_name"] == "my-rubric"
+        assert c["evaluator_version"] == "1"
+        assert c["name"] == "my-rubric"
+        assert c["initialization_parameters"] == {"deployment_name": "gpt-4o"}
+        assert c["data_mapping"] == {
+            "query": "{{item.query_messages}}",
+            "response": "{{item.response_messages}}",
+        }
+
+    def test_generated_evaluator_ref_display_name_used_as_short(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric")
+        criteria = _build_testing_criteria([ref], "gpt-4o")
+
+        assert criteria[0]["name"] == "My Rubric"
+        assert criteria[0]["evaluator_name"] == "my-rubric"
+
+    def test_generated_evaluator_ref_tool_definitions_added(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria(
+            [ref],
+            "gpt-4o",
+            include_data_mapping=True,
+            include_tool_definitions=True,
+        )
+
+        assert criteria[0]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}"
+
+    def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None:
+        import logging
+
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef.latest("my-rubric")
+        with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"):
+            criteria = _build_testing_criteria([ref], "gpt-4o")
+
+        assert "evaluator_version" not in criteria[0]
+        assert any("no pinned version" in r.message for r in caplog.records)
+
+    def test_generated_evaluator_ref_mixed_with_builtins(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria(
+            ["relevance", ref, "task_adherence"],
+            "gpt-4o",
+            include_data_mapping=True,
+        )
+
+        assert [c["name"] for c in criteria] == ["relevance", "my-rubric", "task_adherence"]
+        assert criteria[0]["evaluator_name"] == "builtin.relevance"
+        assert criteria[1]["evaluator_name"] == "my-rubric"
+        assert criteria[2]["evaluator_name"] == "builtin.task_adherence"
+
+
 # ---------------------------------------------------------------------------
 # _build_item_schema
 # ---------------------------------------------------------------------------
@@ -1264,6 +1337,37 @@ def test_raises_when_all_filtered(self) -> None:
             )
 
 
+# ---------------------------------------------------------------------------
+# _filter_tool_evaluators: GeneratedEvaluatorRef entries
+# ---------------------------------------------------------------------------
+
+
+    def test_preserves_generated_ref_when_no_tools(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="rubric", version="1")
+        items = [
+            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),
+        ]
+        result = _filter_tool_evaluators(
+            ["relevance", ref, "tool_call_accuracy"],
+            items,
+        )
+        assert "relevance" in result
+        assert ref in result
+        assert "tool_call_accuracy" not in result
+
+    def test_generated_ref_alone_does_not_raise(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="rubric", version="1")
+        items = [
+            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),
+        ]
+        result = _filter_tool_evaluators([ref], items)
+        assert result == [ref]
+
+
 # ---------------------------------------------------------------------------
 # EvalResults
 # ---------------------------------------------------------------------------
diff --git a/python/uv.lock b/python/uv.lock
index 58c0ed50ee5..dee89c9f0a0 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -604,7 +604,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
     { name = "agent-framework-core", editable = "packages/core" },
-    { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = "<=1.0.0b2,>=1.0.0b2" },
+    { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = ">=1.0.0b2,<=1.0.0b2" },
 ]
 
 [[package]]

From 4bc60462d91361c0c792c724afd8b3d20cb115c9 Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Tue, 26 May 2026 17:31:48 -0700
Subject: [PATCH 03/16] Python: feat(evals): parse rubric_scores from output
 items + assertion helpers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../core/agent_framework/_evaluation.py       | 145 ++++++++++++++++++
 .../core/tests/core/test_local_eval.py        | 101 ++++++++++++
 .../agent_framework_foundry/_foundry_evals.py |  82 +++++++++-
 .../foundry/tests/test_foundry_evals.py       | 100 ++++++++++++
 4 files changed, 426 insertions(+), 2 deletions(-)

diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 32ae5bcfba4..9eb8c4393df 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -499,6 +499,151 @@ def raise_for_status(self, msg: str | None = None) -> None:
                     detail += f" Errored items: {', '.join(summaries)}."
             raise EvalNotPassedError(detail)
 
+    def assert_score_at_least(
+        self,
+        min_score: float,
+        *,
+        evaluator: str | None = None,
+        msg: str | None = None,
+    ) -> None:
+        """Assert every item's score (optionally filtered by evaluator) is ``>= min_score``.
+
+        Designed for CI gates on generated rubric evaluators (e.g.
+        ``results.assert_score_at_least(0.80)``).  Includes any
+        sub-results from workflow evaluations.
+
+        Args:
+            min_score: Minimum acceptable score (inclusive).
+            evaluator: When set, only check scores from the evaluator
+                whose ``EvalScoreResult.name`` matches.
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When any matching score is below the threshold.
+        """
+        offenders: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                for score in item.scores:
+                    if evaluator is not None and score.name != evaluator:
+                        continue
+                    if score.score < min_score:
+                        offenders.append(f"{item.item_id}/{score.name}={score.score:.3f}")
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        if offenders:
+            detail = msg or (
+                f"{len(offenders)} score(s) below threshold {min_score}"
+                f"{' for ' + evaluator if evaluator else ''}: {', '.join(offenders[:5])}"
+                + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "")
+            )
+            raise EvalNotPassedError(detail)
+
+    def assert_dimension_score_at_least(
+        self,
+        dimension_id: str,
+        min_score: float,
+        *,
+        evaluator: str | None = None,
+        require_applicable: bool = False,
+        msg: str | None = None,
+    ) -> None:
+        """Assert every item's score for a rubric *dimension* is ``>= min_score``.
+
+        Walks ``EvalScoreResult.dimensions`` looking for the named
+        dimension across all items (and sub-results).  Non-applicable
+        dimensions are skipped by default; pass
+        ``require_applicable=True`` to fail when no applicable score is
+        produced.
+
+        Args:
+            dimension_id: Dimension id (matches :attr:`RubricScore.id`).
+            min_score: Minimum acceptable dimension score (inclusive).
+            evaluator: When set, only consider scores from the evaluator
+                whose ``EvalScoreResult.name`` matches.
+            require_applicable: When ``True``, raise for items with no applicable score (with
+                ``evaluator`` set, only if it reported the dimension).  Defaults to ``False`` (skip).
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When the dimension fails the threshold.
+        """
+        offenders: list[str] = []
+        missing_items: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                found_applicable = False
+                found_any = False
+                for score in item.scores:
+                    if evaluator is not None and score.name != evaluator:
+                        continue
+                    if not score.dimensions:
+                        continue
+                    for rs in score.dimensions:
+                        if rs.id != dimension_id:
+                            continue
+                        found_any = True
+                        if not rs.applicable:
+                            continue
+                        found_applicable = True
+                        if rs.score is None or rs.score < min_score:
+                            offenders.append(
+                                f"{item.item_id}/{score.name}/{dimension_id}="
+                                f"{rs.score if rs.score is not None else 'None'}"
+                            )
+                if require_applicable and not found_applicable and (not evaluator or found_any):
+                    missing_items.append(item.item_id)
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        problems: list[str] = []
+        if offenders:
+            problems.append(
+                f"{len(offenders)} dimension score(s) for '{dimension_id}' below {min_score}: "
+                f"{', '.join(offenders[:5])}" + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "")
+            )
+        if missing_items:
+            problems.append(
+                f"Dimension '{dimension_id}' not applicable on {len(missing_items)} item(s): "
+                f"{', '.join(missing_items[:5])}"
+            )
+        if problems:
+            raise EvalNotPassedError(msg or "; ".join(problems))
+
+    def assert_no_failed_items(self, msg: str | None = None) -> None:
+        """Assert no item ended in ``fail`` or ``error`` status.
+
+        Includes any sub-results from workflow evaluations.
+
+        Args:
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When any item failed or errored.
+        """
+        bad: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                if item.is_failed or item.is_error:
+                    bad.append(f"{item.item_id}:{item.status}")
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        if bad:
+            detail = msg or (
+                f"{len(bad)} item(s) failed or errored: {', '.join(bad[:5])}"
+                + (f" (+{len(bad) - 5} more)" if len(bad) > 5 else "")
+            )
+            raise EvalNotPassedError(detail)
+
+
 # endregion
 
 # region Generated rubric evaluators
diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py
index 96b0e1a3915..27c413b7151 100644
--- a/python/packages/core/tests/core/test_local_eval.py
+++ b/python/packages/core/tests/core/test_local_eval.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import inspect
+from typing import Any
 
 import pytest
 
@@ -1026,3 +1027,103 @@ def test_returns_none_for_empty_outputs(self):
         mock_result.get_outputs.return_value = []
         item = _build_overall_item("Hello", mock_result)
         assert item is None
+
+
+class TestRubricAssertions:
+    """Tests for EvalResults rubric assertion helpers."""
+
+    def _build_results(self, item_scores: list[list[tuple[str, float, list[Any] | None]]]) -> Any:
+        from agent_framework._evaluation import EvalItemResult, EvalResults, EvalScoreResult
+
+        items: list[EvalItemResult] = []
+        for i, scores in enumerate(item_scores):
+            items.append(
+                EvalItemResult(
+                    item_id=f"oi_{i}",
+                    status="pass",
+                    scores=[EvalScoreResult(name=name, score=score, dimensions=dims) for name, score, dims in scores],
+                )
+            )
+        return EvalResults(
+            provider="Local",
+            status="completed",
+            result_counts={"passed": len(items), "failed": 0, "errored": 0},
+            items=items,
+        )
+
+    def test_assert_score_at_least_passes(self):
+        results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.85, None)]])
+        results.assert_score_at_least(0.8)
+
+    def test_assert_score_at_least_raises(self):
+        from agent_framework._evaluation import EvalNotPassedError
+
+        results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.5, None)]])
+        with pytest.raises(EvalNotPassedError, match="below threshold"):
+            results.assert_score_at_least(0.8)
+
+    def test_assert_score_at_least_filtered_by_evaluator(self):
+        from agent_framework._evaluation import EvalNotPassedError
+
+        results = self._build_results([[("relevance", 0.9, None), ("coherence", 0.3, None)]])
+        # Coherence is low — only fails when not filtered out.
+        results.assert_score_at_least(0.8, evaluator="relevance")
+        with pytest.raises(EvalNotPassedError):
+            results.assert_score_at_least(0.8, evaluator="coherence")
+
+    def test_assert_dimension_score_at_least(self):
+        from agent_framework._evaluation import EvalNotPassedError, RubricScore
+
+        dims_pass = [
+            RubricScore(id="policy", score=4, applicable=True, weight=1, reason="ok"),
+            RubricScore(id="safety", score=5, applicable=True, weight=1, reason="ok"),
+        ]
+        dims_fail = [
+            RubricScore(id="policy", score=2, applicable=True, weight=1, reason="bad"),
+        ]
+        results = self._build_results([[("rubric", 0.9, dims_pass)], [("rubric", 0.5, dims_fail)]])
+        # Safety passes everywhere — no raise.
+        results.assert_dimension_score_at_least("safety", 4)
+        # Policy fails on the second item.
+        with pytest.raises(EvalNotPassedError, match="policy"):
+            results.assert_dimension_score_at_least("policy", 3)
+
+    def test_assert_dimension_skips_non_applicable_by_default(self):
+        from agent_framework._evaluation import RubricScore
+
+        dims = [
+            RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"),
+        ]
+        results = self._build_results([[("rubric", 0.9, dims)]])
+        # No applicable scores — should not raise.
+        results.assert_dimension_score_at_least("optional", 3)
+
+    def test_assert_dimension_require_applicable_raises(self):
+        from agent_framework._evaluation import EvalNotPassedError, RubricScore
+
+        dims = [
+            RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"),
+        ]
+        results = self._build_results([[("rubric", 0.9, dims)]])
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("optional", 3, require_applicable=True)
+
+    def test_assert_no_failed_items(self):
+        from agent_framework._evaluation import EvalItemResult, EvalNotPassedError, EvalResults
+
+        results = EvalResults(
+            provider="Local",
+            status="completed",
+            result_counts={"passed": 1, "failed": 1, "errored": 0},
+            items=[
+                EvalItemResult(item_id="oi_pass", status="pass"),
+                EvalItemResult(item_id="oi_fail", status="fail"),
+            ],
+        )
+        with pytest.raises(EvalNotPassedError, match="failed"):
+            results.assert_no_failed_items()
+
+
+# ---------------------------------------------------------------------------
+# r5 review: _build_overall_item with empty outputs
+# ---------------------------------------------------------------------------
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index 9cfcc4bc678..b6e3530654c 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -30,7 +30,7 @@
 import logging
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from agent_framework._evaluation import (
     AgentEvalConverter,
@@ -40,6 +40,7 @@
     EvalItemResult,
     EvalResults,
     EvalScoreResult,
+    RubricScore,
 )
 from agent_framework._feature_stage import ExperimentalFeature, experimental
 from openai import AsyncOpenAI
@@ -497,6 +498,80 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int]
     return per_eval
 
 
+
+def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
+    """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.
+
+    The per-dimension breakdown may sit in different places across SDK
+    versions; these shapes are tried in order:
+
+    * ``sample.properties.rubric_scores`` (``properties`` object or dict).
+    * ``sample["properties"]["rubric_scores"]`` when ``sample`` is a dict.
+    * ``sample["rubric_scores"]`` when ``sample`` is a dict.
+
+    Best-effort: entries lacking ``id``/``weight``/``applicable`` or whose
+    numbers fail ``int()`` conversion are skipped.  Returns ``None`` when
+    nothing usable is found (absent, empty or non-iterable payload).
+    """
+    if sample is None:
+        return None
+    # Attribute-style lookup first (SDK objects), then the dict-style fallbacks.
+    raw: Any = None
+    properties: Any = getattr(sample, "properties", None)
+    if properties is not None:
+        raw = getattr(properties, "rubric_scores", None)
+        if raw is None and isinstance(properties, dict):
+            raw = cast("dict[str, Any]", properties).get("rubric_scores")
+    if raw is None and isinstance(sample, dict):
+        sample_any = cast("dict[str, Any]", sample)
+        props_dict: Any = sample_any.get("properties")
+        if isinstance(props_dict, dict):
+            raw = cast("dict[str, Any]", props_dict).get("rubric_scores")
+        if raw is None:
+            raw = sample_any.get("rubric_scores")
+    # A non-iterable payload (e.g. a bare number) would raise at the loop below.
+    if not raw or not hasattr(raw, "__iter__"):
+        return None
+
+    parsed: list[RubricScore] = []
+    raw_iter: Any = raw
+    for raw_entry in raw_iter:
+        entry: Any = raw_entry
+        try:
+            rid: Any
+            score_val: Any
+            applicable: Any
+            weight: Any
+            reason: Any
+            if isinstance(entry, dict):
+                entry_any = cast("dict[str, Any]", entry)
+                rid = entry_any.get("id")
+                score_val = entry_any.get("score")
+                applicable = entry_any.get("applicable")
+                weight = entry_any.get("weight")
+                reason = entry_any.get("reason", "")
+            else:
+                rid = getattr(entry, "id", None)
+                score_val = getattr(entry, "score", None)
+                applicable = getattr(entry, "applicable", None)
+                weight = getattr(entry, "weight", None)
+                reason = getattr(entry, "reason", "") or ""
+            if rid is None or weight is None or applicable is None:
+                continue
+            parsed.append(
+                RubricScore(
+                    id=str(rid),
+                    score=int(score_val) if isinstance(score_val, (int, float)) else None,
+                    applicable=bool(applicable),
+                    weight=int(weight),
+                    reason=str(reason) if reason is not None else "",
+                )
+            )
+        except (TypeError, ValueError, OverflowError):  # OverflowError: int() of an infinite float
+            logger.debug("Skipping malformed rubric_scores entry: %s", cast("Any", entry), exc_info=True)
+    return parsed or None
+
+
 async def _fetch_output_items(
     client: AsyncOpenAI,
     eval_id: str,
@@ -520,12 +595,15 @@ async def _fetch_output_items(
             # Extract per-evaluator scores
             scores: list[EvalScoreResult] = []
             for r in oi.results or []:
+                sample = r.sample
+                dimensions = _extract_rubric_scores(sample)
                 scores.append(
                     EvalScoreResult(
                         name=r.name,
                         score=r.score,
                         passed=r.passed,
-                        sample=r.sample,
+                        sample=sample,
+                        dimensions=dimensions,
                     )
                 )
 
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index f1ba5c86153..7502726d1ad 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -2474,6 +2474,106 @@ async def test_handles_api_failure_gracefully(self) -> None:
         assert items == []
 
 
+# ---------------------------------------------------------------------------
+# _fetch_output_items / _extract_rubric_scores — rubric score extraction
+# ---------------------------------------------------------------------------
+
+
+    async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
+        from agent_framework_foundry._foundry_evals import _fetch_output_items
+        # One evaluator result whose dict sample nests rubric_scores under "properties".
+        mock_result = MagicMock()
+        mock_result.name = "my-rubric"
+        mock_result.score = 0.85
+        mock_result.passed = True
+        mock_result.sample = {
+            "properties": {
+                "rubric_scores": [
+                    {"id": "policy", "score": 4, "applicable": True, "weight": 1, "reason": "ok"},
+                    {"id": "safety", "score": None, "applicable": False, "weight": 1, "reason": "n/a"},
+                ]
+            }
+        }
+        # Output item carrying that single evaluator result.
+        mock_oi = MagicMock()
+        mock_oi.id = "oi_1"
+        mock_oi.status = "pass"
+        mock_oi.results = [mock_result]
+        mock_oi.sample = None
+        mock_oi.datasource_item = {}
+        # Client whose output_items.list call yields a page holding only mock_oi.
+        mock_client = MagicMock()
+        mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi]))
+
+        items = await _fetch_output_items(mock_client, "eval_1", "run_1")
+        # Both dimensions must come back typed; the non-applicable one keeps score=None.
+        assert len(items) == 1
+        scores = items[0].scores
+        assert len(scores) == 1
+        assert scores[0].dimensions is not None
+        assert len(scores[0].dimensions) == 2
+        policy = next(d for d in scores[0].dimensions if d.id == "policy")
+        assert policy.score == 4
+        assert policy.applicable is True
+        assert policy.weight == 1
+        assert policy.reason == "ok"
+        safety = next(d for d in scores[0].dimensions if d.id == "safety")
+        assert safety.score is None
+        assert safety.applicable is False
+
+class TestExtractRubricScores:
+    def test_handles_attribute_style_properties(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+        # Entry exposed as an object with attributes rather than as a dict.
+        rs = MagicMock()
+        rs.id = "policy"
+        rs.score = 5
+        rs.applicable = True
+        rs.weight = 2
+        rs.reason = "ok"
+        # Sample reached purely through attribute access: sample.properties.rubric_scores.
+        sample = MagicMock()
+        sample.properties = MagicMock()
+        sample.properties.rubric_scores = [rs]
+
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "policy"
+        assert result[0].score == 5
+        assert result[0].weight == 2
+
+    def test_top_level_rubric_scores_in_dict(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+        # No "properties" wrapper: the list sits at the top level of the sample dict.
+        sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]}
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "a"
+
+    def test_returns_none_when_missing(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+        # None, an empty dict and empty "properties" all carry no rubric scores.
+        assert _extract_rubric_scores(None) is None
+        assert _extract_rubric_scores({}) is None
+        assert _extract_rubric_scores({"properties": {}}) is None
+
+    def test_skips_malformed_entries(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+        # The second entry lacks "weight" and must be dropped rather than raise.
+        sample = {
+            "properties": {
+                "rubric_scores": [
+                    {"id": "good", "score": 3, "applicable": True, "weight": 1, "reason": "ok"},
+                    {"id": "bad-no-weight", "score": 2, "applicable": True, "reason": "x"},
+                ]
+            }
+        }
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert len(result) == 1
+        assert result[0].id == "good"
+
+
 # ---------------------------------------------------------------------------
 # _poll_eval_run — timeout / failed / canceled paths
 # ---------------------------------------------------------------------------

From 38d51d13b14577a1ff671b4e14cfdeed4f03aefb Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Tue, 26 May 2026 17:34:04 -0700
Subject: [PATCH 04/16] Python: feat(evals): BaseAgent.as_eval_source /
 Workflow.as_eval_source

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../packages/core/agent_framework/_agents.py  |  43 +++
 .../core/agent_framework/_evaluation.py       | 151 +++++++++-
 .../agent_framework/_workflows/_workflow.py   |  49 +++
 .../core/tests/core/test_local_eval.py        | 284 +++++++++++-------
 4 files changed, 416 insertions(+), 111 deletions(-)

diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py
index 585898ae523..65506cadc6f 100644
--- a/python/packages/core/agent_framework/_agents.py
+++ b/python/packages/core/agent_framework/_agents.py
@@ -444,6 +444,49 @@ def get_session(self, service_session_id: str, *, session_id: str | None = None)
         """
         return AgentSession(session_id=session_id, service_session_id=service_session_id)
 
+    def as_eval_source(
+        self,
+        *,
+        include_instructions: bool = True,
+        include_tools: bool = True,
+        include_context_providers: bool = False,
+        include_examples: bool = False,
+        examples: Sequence[str] | None = None,
+    ) -> str:
+        """Render this agent as a textual dossier for rubric-evaluator generation.
+
+        Packages the agent's name, description, instructions, tool
+        definitions, and optional context-provider class names into a
+        single plain-text dossier suitable for passing to a rubric
+        generation pipeline (e.g. ``FoundryEvals.generate_rubric``).
+
+        Defaults are conservative: instructions and tools are included;
+        examples and context-provider class names are not.
+
+        Keyword Args:
+            include_instructions: Include the instructions text, taken from
+                ``default_options["instructions"]`` or else ``instructions``.
+            include_tools: Include each tool's name, description and parameters.
+            include_context_providers: Include the class name of every
+                attached context provider.
+            include_examples: Whether to include the supplied ``examples``.
+            examples: Sample queries / interactions, rendered as a numbered
+                list; ignored unless ``include_examples`` is true.
+
+        Returns:
+            The dossier text; sections with nothing to show are omitted.
+        """
+        from ._evaluation import _render_agent_dossier  # pyright: ignore[reportPrivateUsage]
+
+        return _render_agent_dossier(
+            self,
+            include_instructions=include_instructions,
+            include_tools=include_tools,
+            include_context_providers=include_context_providers,
+            include_examples=include_examples,
+            examples=examples,
+        )
+
     async def _run_after_providers(
         self,
         *,
diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 9eb8c4393df..48704d3543c 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -311,8 +311,8 @@ class EvalScoreResult:
         score: Numeric score from the evaluator.
         passed: Whether the item passed this evaluator's threshold.
         sample: Optional raw evaluator output (rationale, metadata).
-        dimensions: Per-dimension scores for rubric-based evaluators.
-            ``None`` for non-rubric (e.g. built-in) evaluators.
+        dimensions: Per-dimension scores when this evaluator is a rubric
+            evaluator.  ``None`` for non-rubric (e.g. built-in) evaluators.
     """
 
     name: str
@@ -560,7 +560,7 @@ def assert_dimension_score_at_least(
         produced.
 
         Args:
-            dimension_id: Dimension id (matches :attr:`RubricDimension.id`).
+            dimension_id: Dimension id (matches the rubric definition).
             min_score: Minimum acceptable dimension score (inclusive).
             evaluator: When set, only consider scores from the evaluator
                 whose ``EvalScoreResult.name`` matches.
@@ -654,14 +654,13 @@ def _check(results: EvalResults) -> None:
 class RubricScore:
     """A single dimension's score from a rubric-based evaluator run.
 
-    Rubric evaluators (e.g. Foundry's generated rubric evaluators) emit
-    one ``RubricScore`` per dimension per item.  Attached to
-    :class:`EvalScoreResult` as a typed view of the raw
-    ``properties.rubric_scores`` payload.
+    Rubric evaluators emit one ``RubricScore`` per dimension per item.
+    Attached to :class:`EvalScoreResult` as a typed view of the raw
+    ``properties.rubric_scores`` payload returned by providers such as
+    Foundry's generated rubric evaluators.
 
     Attributes:
-        id: Stable identifier for the dimension (e.g.
-            ``"policy_enforcement"``) defined by the rubric.
+        id: Dimension id (matches the rubric definition).
         score: Numeric score, or ``None`` when the dimension was marked
             non-applicable for this item.
         applicable: Whether the dimension applied to this item.
@@ -676,6 +675,140 @@ class RubricScore:
     reason: str
 
 
+# endregion
+
+# region Eval source rendering
+
+
+def _render_agent_dossier(
+    agent: Any,
+    *,
+    include_instructions: bool,
+    include_tools: bool,
+    include_context_providers: bool,
+    include_examples: bool,
+    examples: Sequence[str] | None,
+) -> str:
+    """Build the plain-text agent dossier used as rubric-generation input.
+
+    Sections appear in a fixed order (identity, instructions, tools, context
+    providers, examples); a section with nothing to show is left out.  The
+    ``include_*`` flags switch the optional sections on or off, and
+    ``examples`` is only rendered when ``include_examples`` is true.
+
+    Returns:
+        The dossier as newline-joined text with outer whitespace stripped.
+    """
+
+    def _usable(value: Any) -> str | None:
+        return value if isinstance(value, str) and value.strip() else None
+
+    out: list[str] = [f"Agent name: {getattr(agent, 'name', None) or '<unnamed agent>'}"]
+    about = getattr(agent, "description", None)
+    if about:
+        out.append(f"Description: {about}")
+
+    if include_instructions:
+        # ``default_options["instructions"]`` wins over the ``instructions`` attribute.
+        options: Any = getattr(agent, "default_options", None)
+        text = _usable(cast("dict[str, Any]", options).get("instructions")) if isinstance(options, dict) else None
+        if text is None:
+            text = _usable(getattr(agent, "instructions", None))
+        if text:
+            out.extend(["", "Instructions:", text.strip()])
+
+    if include_tools:
+        tool_defs = AgentEvalConverter.extract_tools(agent)
+        if tool_defs:
+            out.extend(["", "Tools:"])
+            for spec in tool_defs:
+                bullet = f"- {spec['name']}"
+                summary = spec.get("description")
+                out.append(f"{bullet}: {summary}" if summary else bullet)
+                schema = spec.get("parameters")
+                if not schema:
+                    continue
+                try:
+                    encoded = json.dumps(schema, sort_keys=True)
+                except (TypeError, ValueError):
+                    # Fall back to ``str()`` when the schema is not JSON-serializable.
+                    encoded = str(schema)
+                out.append(f"  parameters: {encoded}")
+
+    if include_context_providers:
+        providers = getattr(agent, "context_providers", None)
+        if providers:
+            # Only the provider class names are listed.
+            out.extend(["", "Context providers:"])
+            out.extend(f"- {type(item).__name__}" for item in providers)
+
+    if include_examples and examples:
+        out.extend(["", "Examples:"])
+        out.extend(f"{pos}. {sample}" for pos, sample in enumerate(examples, start=1))
+
+    return "\n".join(out).strip()
+
+
+def _render_workflow_dossier(  # pyright: ignore[reportUnusedFunction]
+    workflow: Workflow,
+    *,
+    include_instructions: bool,
+    include_tools: bool,
+    include_context_providers: bool,
+    include_examples: bool,
+    examples: Sequence[str] | None,
+    include_topology: bool,
+) -> str:
+    """Build the plain-text workflow dossier used as rubric-generation input.
+
+    Layout: workflow identity, optional JSON topology, one agent dossier per
+    ``AgentExecutor`` node, then optional workflow-level examples.  The
+    ``include_instructions``/``include_tools``/``include_context_providers``
+    flags are forwarded to each agent dossier.
+    """
+    from ._workflows._agent_executor import AgentExecutor as _AE
+
+    out: list[str] = [f"Workflow name: {workflow.name or '<unnamed workflow>'}"]
+    if workflow.description:
+        out.append(f"Description: {workflow.description}")
+
+    if include_topology:
+        topology: str | None = None
+        try:
+            topology = json.dumps(workflow.to_dict(), sort_keys=True, default=str)
+        except (TypeError, ValueError) as exc:
+            # Best-effort: a definition that cannot be serialized just drops the section.
+            logger.debug("Workflow.to_dict() failed during eval source export: %s", exc)
+        if topology:
+            out.extend(["", "Topology (JSON):", topology])
+
+    # Only executors that wrap an agent contribute a per-agent dossier.
+    agent_nodes: list[tuple[str, Any]] = [
+        (node_id, node.agent) for node_id, node in workflow.executors.items() if isinstance(node, _AE)
+    ]
+    if agent_nodes:
+        out.extend(["", "Agents:"])
+    for node_id, node_agent in agent_nodes:
+        out.extend(["", f"Executor: {node_id}"])
+        out.append(
+            _render_agent_dossier(
+                node_agent,
+                include_instructions=include_instructions,
+                include_tools=include_tools,
+                include_context_providers=include_context_providers,
+                # Examples are rendered once below, at workflow level.
+                include_examples=False,
+                examples=None,
+            )
+        )
+
+    if include_examples and examples:
+        out.extend(["", "Examples:"])
+        out.extend(f"{pos}. {sample}" for pos, sample in enumerate(examples, start=1))
+
+    return "\n".join(out).strip()
+
+
 # endregion
 
 # region Evaluator protocol
diff --git a/python/packages/core/agent_framework/_workflows/_workflow.py b/python/packages/core/agent_framework/_workflows/_workflow.py
index 0493cd015f3..bce7569ef1a 100644
--- a/python/packages/core/agent_framework/_workflows/_workflow.py
+++ b/python/packages/core/agent_framework/_workflows/_workflow.py
@@ -410,6 +410,55 @@ def to_json(self) -> str:
         """Serialize the workflow definition to JSON."""
         return json.dumps(self.to_dict())
 
+    def as_eval_source(
+        self,
+        *,
+        include_instructions: bool = True,
+        include_tools: bool = True,
+        include_context_providers: bool = False,
+        include_examples: bool = False,
+        examples: Sequence[str] | None = None,
+        include_topology: bool = True,
+    ) -> str:
+        """Render this workflow as a textual dossier for rubric-evaluator generation.
+
+        Produces a plain-text dossier containing the workflow's name,
+        description, optional JSON-encoded topology (from
+        :meth:`Workflow.to_dict`), and per-agent dossiers extracted from
+        ``AgentExecutor`` nodes.  Suitable for passing to a rubric
+        generation pipeline (e.g. ``FoundryEvals.generate_rubric``).
+
+        Defaults are conservative: per-agent instructions and tools are
+        included, plus the JSON-encoded topology.  Examples and
+        context-provider class names are excluded by default.
+
+        Keyword Args:
+            include_instructions: Include each agent's instructions text.
+            include_tools: Include each agent's tool definitions.
+            include_context_providers: Include each agent's
+                context-provider class names.
+            include_examples: Whether to include ``examples``; they are
+                rendered once for the workflow, never per agent.
+            examples: Sample queries / interactions to include when
+                ``include_examples`` is true.
+            include_topology: Embed the JSON-encoded topology; it is left
+                out when serializing it raises ``TypeError``/``ValueError``.
+
+        Returns:
+            A plain-text dossier describing the workflow.
+        """
+        from .._evaluation import _render_workflow_dossier  # pyright: ignore[reportPrivateUsage]
+
+        return _render_workflow_dossier(
+            self,
+            include_instructions=include_instructions,
+            include_tools=include_tools,
+            include_context_providers=include_context_providers,
+            include_examples=include_examples,
+            examples=examples,
+            include_topology=include_topology,
+        )
+
     def get_start_executor(self) -> Executor:
         """Get the starting executor of the workflow.
 
diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py
index 27c413b7151..c13b107c4bd 100644
--- a/python/packages/core/tests/core/test_local_eval.py
+++ b/python/packages/core/tests/core/test_local_eval.py
@@ -1011,119 +1011,199 @@ def test_all_passed_parent_fails_when_own_counts_fail(self):
 
 
 # ---------------------------------------------------------------------------
-# r5 review: _build_overall_item with empty outputs
+# Eval source rendering (string dossiers)
 # ---------------------------------------------------------------------------
 
 
-class TestBuildOverallItemEmpty:
-    """Test _build_overall_item returns None for empty workflow outputs."""
+class TestAgentAsEvalSource:
+    """Tests for BaseAgent.as_eval_source / _render_agent_dossier."""
 
-    def test_returns_none_for_empty_outputs(self):
+    def _make_mock_agent(
+        self,
+        *,
+        name: str = "weather-bot",
+        description: str | None = "Looks up the weather.",
+        instructions: str | None = "Be concise.  Always cite the source.",
+        tools: list[Any] | None = None,
+        context_providers: list[Any] | None = None,
+        mcp_tools: list[Any] | None = None,
+    ) -> Any:
         from unittest.mock import MagicMock
 
-        from agent_framework._evaluation import _build_overall_item
-
-        mock_result = MagicMock()
-        mock_result.get_outputs.return_value = []
-        item = _build_overall_item("Hello", mock_result)
-        assert item is None
-
-
-class TestRubricAssertions:
-    """Tests for EvalResults rubric assertion helpers."""
-
-    def _build_results(self, item_scores: list[list[tuple[str, float, list[Any] | None]]]) -> Any:
-        from agent_framework._evaluation import EvalItemResult, EvalResults, EvalScoreResult
-
-        items: list[EvalItemResult] = []
-        for i, scores in enumerate(item_scores):
-            items.append(
-                EvalItemResult(
-                    item_id=f"oi_{i}",
-                    status="pass",
-                    scores=[EvalScoreResult(name=name, score=score, dimensions=dims) for name, score, dims in scores],
-                )
-            )
-        return EvalResults(
-            provider="Local",
-            status="completed",
-            result_counts={"passed": len(items), "failed": 0, "errored": 0},
-            items=items,
+        from agent_framework._tools import ai_function
+
+        agent = MagicMock()
+        agent.name = name
+        agent.description = description
+        agent.default_options = {"instructions": instructions, "tools": tools or []}
+        agent.context_providers = context_providers or []
+        agent.mcp_tools = mcp_tools or []
+        if tools:
+            normalized: list[Any] = []
+            for t in tools:
+                if callable(t) and not hasattr(t, "parameters"):
+                    normalized.append(ai_function(t))
+                else:
+                    normalized.append(t)
+            agent.default_options["tools"] = normalized
+        return agent
+
+    def _render(self, agent: Any, **overrides: Any) -> str:
+        from agent_framework._evaluation import _render_agent_dossier
+
+        kwargs: dict[str, Any] = {
+            "include_instructions": True,
+            "include_tools": True,
+            "include_context_providers": False,
+            "include_examples": False,
+            "examples": None,
+        }
+        kwargs.update(overrides)
+        return _render_agent_dossier(agent, **kwargs)
+
+    def test_basic_dossier_includes_name_and_instructions(self):
+        agent = self._make_mock_agent()
+        dossier = self._render(agent)
+        assert isinstance(dossier, str)
+        assert "Agent name: weather-bot" in dossier
+        assert "Description: Looks up the weather." in dossier
+        assert "Instructions:" in dossier
+        assert "Be concise." in dossier
+
+    def test_tools_section_includes_definitions(self):
+        def get_weather(city: str) -> str:
+            """Return the current weather for *city*."""
+            return f"sunny in {city}"
+
+        agent = self._make_mock_agent(tools=[get_weather])
+        dossier = self._render(agent)
+        assert "Tools:" in dossier
+        assert "- get_weather" in dossier
+        assert '"city"' in dossier
+
+    def test_include_instructions_false_omits_section(self):
+        agent = self._make_mock_agent()
+        dossier = self._render(agent, include_instructions=False)
+        assert "Instructions:" not in dossier
+
+    def test_include_tools_false_omits_section(self):
+        def get_weather(city: str) -> str:
+            return f"sunny in {city}"
+
+        agent = self._make_mock_agent(tools=[get_weather])
+        dossier = self._render(agent, include_tools=False)
+        assert "Tools:" not in dossier
+
+    def test_context_providers_excluded_by_default_but_included_when_opted_in(self):
+        class StubProvider:
+            pass
+
+        agent = self._make_mock_agent(context_providers=[StubProvider()])
+        default_dossier = self._render(agent)
+        assert "Context providers:" not in default_dossier
+
+        opt_in_dossier = self._render(agent, include_context_providers=True)
+        assert "Context providers:" in opt_in_dossier
+        assert "- StubProvider" in opt_in_dossier
+
+    def test_examples_excluded_by_default_but_included_when_opted_in(self):
+        agent = self._make_mock_agent()
+        default_dossier = self._render(agent, examples=["What's the weather in NYC?"])
+        assert "Examples:" not in default_dossier
+
+        opt_in_dossier = self._render(
+            agent,
+            include_examples=True,
+            examples=["What's the weather in NYC?"],
         )
+        assert "Examples:" in opt_in_dossier
+        assert "What's the weather in NYC?" in opt_in_dossier
 
-    def test_assert_score_at_least_passes(self):
-        results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.85, None)]])
-        results.assert_score_at_least(0.8)
-
-    def test_assert_score_at_least_raises(self):
-        from agent_framework._evaluation import EvalNotPassedError
-
-        results = self._build_results([[("relevance", 0.9, None)], [("relevance", 0.5, None)]])
-        with pytest.raises(EvalNotPassedError, match="below threshold"):
-            results.assert_score_at_least(0.8)
-
-    def test_assert_score_at_least_filtered_by_evaluator(self):
-        from agent_framework._evaluation import EvalNotPassedError
-
-        results = self._build_results([[("relevance", 0.9, None), ("coherence", 0.3, None)]])
-        # Coherence is low — only fails when not filtered out.
-        results.assert_score_at_least(0.8, evaluator="relevance")
-        with pytest.raises(EvalNotPassedError):
-            results.assert_score_at_least(0.8, evaluator="coherence")
-
-    def test_assert_dimension_score_at_least(self):
-        from agent_framework._evaluation import EvalNotPassedError, RubricScore
+    def test_base_agent_method_returns_dossier_string(self):
+        from agent_framework._agents import BaseAgent
 
-        dims_pass = [
-            RubricScore(id="policy", score=4, applicable=True, weight=1, reason="ok"),
-            RubricScore(id="safety", score=5, applicable=True, weight=1, reason="ok"),
-        ]
-        dims_fail = [
-            RubricScore(id="policy", score=2, applicable=True, weight=1, reason="bad"),
-        ]
-        results = self._build_results([[("rubric", 0.9, dims_pass)], [("rubric", 0.5, dims_fail)]])
-        # Safety passes everywhere — no raise.
-        results.assert_dimension_score_at_least("safety", 4)
-        # Policy fails on the second item.
-        with pytest.raises(EvalNotPassedError, match="policy"):
-            results.assert_dimension_score_at_least("policy", 3)
-
-    def test_assert_dimension_skips_non_applicable_by_default(self):
-        from agent_framework._evaluation import RubricScore
-
-        dims = [
-            RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"),
-        ]
-        results = self._build_results([[("rubric", 0.9, dims)]])
-        # No applicable scores — should not raise.
-        results.assert_dimension_score_at_least("optional", 3)
+        class _ConcreteAgent(BaseAgent):
+            pass
 
-    def test_assert_dimension_require_applicable_raises(self):
-        from agent_framework._evaluation import EvalNotPassedError, RubricScore
+        agent = _ConcreteAgent(name="test-agent", description="A test agent.")
+        dossier = agent.as_eval_source()
+        assert isinstance(dossier, str)
+        assert "Agent name: test-agent" in dossier
 
-        dims = [
-            RubricScore(id="optional", score=None, applicable=False, weight=1, reason="n/a"),
-        ]
-        results = self._build_results([[("rubric", 0.9, dims)]])
-        with pytest.raises(EvalNotPassedError, match="not applicable"):
-            results.assert_dimension_score_at_least("optional", 3, require_applicable=True)
 
-    def test_assert_no_failed_items(self):
-        from agent_framework._evaluation import EvalItemResult, EvalNotPassedError, EvalResults
-
-        results = EvalResults(
-            provider="Local",
-            status="completed",
-            result_counts={"passed": 1, "failed": 1, "errored": 0},
-            items=[
-                EvalItemResult(item_id="oi_pass", status="pass"),
-                EvalItemResult(item_id="oi_fail", status="fail"),
-            ],
-        )
-        with pytest.raises(EvalNotPassedError, match="failed"):
-            results.assert_no_failed_items()
+class TestWorkflowAsEvalSource:
+    """Tests for Workflow.as_eval_source / _render_workflow_dossier."""
 
+    def _build_workflow(self, *, with_agent: bool = False) -> Any:
+        from unittest.mock import MagicMock
 
-# ---------------------------------------------------------------------------
-# r5 review: _build_overall_item with empty outputs
-# ---------------------------------------------------------------------------
+        from agent_framework._workflows._agent_executor import AgentExecutor
+
+        workflow = MagicMock()
+        workflow.name = "demo-workflow"
+        workflow.description = "Routes user questions through a single agent."
+        workflow.to_dict.return_value = {
+            "name": "demo-workflow",
+            "id": "wf_1",
+            "start_executor_id": "agent_1",
+            "edge_groups": [],
+            "executors": {"agent_1": {"type": "AgentExecutor"}},
+        }
+
+        if with_agent:
+            inner_agent = MagicMock()
+            inner_agent.name = "inner-agent"
+            inner_agent.description = "Inner agent."
+            inner_agent.default_options = {"instructions": "Answer politely.", "tools": []}
+            inner_agent.context_providers = []
+            inner_agent.mcp_tools = []
+
+            executor = MagicMock(spec=AgentExecutor)
+            executor.agent = inner_agent
+            workflow.executors = {"agent_1": executor}
+        else:
+            workflow.executors = {}
+        return workflow
+
+    def _render(self, workflow: Any, **overrides: Any) -> str:
+        from agent_framework._evaluation import _render_workflow_dossier
+
+        kwargs: dict[str, Any] = {
+            "include_instructions": True,
+            "include_tools": True,
+            "include_context_providers": False,
+            "include_examples": False,
+            "examples": None,
+            "include_topology": True,
+        }
+        kwargs.update(overrides)
+        return _render_workflow_dossier(workflow, **kwargs)
+
+    def test_emits_dossier_with_topology(self):
+        workflow = self._build_workflow()
+        dossier = self._render(workflow)
+        assert isinstance(dossier, str)
+        assert "Workflow name: demo-workflow" in dossier
+        assert "Topology (JSON):" in dossier
+        assert '"start_executor_id": "agent_1"' in dossier
+
+    def test_topology_can_be_disabled(self):
+        workflow = self._build_workflow()
+        dossier = self._render(workflow, include_topology=False)
+        assert "Topology (JSON):" not in dossier
+
+    def test_per_agent_dossiers_included_when_executor_is_agent_executor(self):
+        workflow = self._build_workflow(with_agent=True)
+        dossier = self._render(workflow)
+        assert "Agents:" in dossier
+        assert "Executor: agent_1" in dossier
+        assert "Agent name: inner-agent" in dossier
+        assert "Answer politely." in dossier
+
+    def test_workflow_examples_excluded_by_default(self):
+        workflow = self._build_workflow()
+        default_dossier = self._render(workflow, examples=["Hi"])
+        assert "Examples:" not in default_dossier
+
+        opt_in_dossier = self._render(workflow, examples=["Hi"], include_examples=True)
+        assert "Examples:" in opt_in_dossier

From a9e46765ea5e94877c8525e4a62db13396c4ae7c Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Wed, 27 May 2026 08:21:06 -0700
Subject: [PATCH 05/16] Python: feat(foundry-evals): EvalGenerationSource +
 generate_rubric helper

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/__init__.py       |   6 +
 .../agent_framework_foundry/_foundry_evals.py | 498 +++++++++++++++++-
 .../foundry/tests/test_foundry_evals.py       | 459 +++++++++++++++-
 3 files changed, 936 insertions(+), 27 deletions(-)

diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py
index 14eebfaffa0..cafe30eb955 100644
--- a/python/packages/foundry/agent_framework_foundry/__init__.py
+++ b/python/packages/foundry/agent_framework_foundry/__init__.py
@@ -11,11 +11,14 @@
     RawFoundryEmbeddingClient,
 )
 from ._foundry_evals import (
+    EvalGenerationSource,
     FoundryEvals,
     GeneratedEvaluatorRef,
     RubricDimension,
+    agent_as_eval_source,
     evaluate_foundry_target,
     evaluate_traces,
+    workflow_as_eval_source,
 )
 from ._memory_provider import FoundryMemoryProvider
 
@@ -25,6 +28,7 @@
     __version__ = "0.0.0"
 
 __all__ = [
+    "EvalGenerationSource",
     "FoundryAgent",
     "FoundryAgentOptions",
     "FoundryChatClient",
@@ -41,6 +45,8 @@
     "RawFoundryEmbeddingClient",
     "RubricDimension",
     "__version__",
+    "agent_as_eval_source",
     "evaluate_foundry_target",
     "evaluate_traces",
+    "workflow_as_eval_source",
 ]
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index b6e3530654c..0d83d8b1bc3 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -48,6 +48,8 @@
 from ._chat_client import FoundryChatClient
 
 if TYPE_CHECKING:
+    from agent_framework._agents import BaseAgent
+    from agent_framework._workflows._workflow import Workflow
     from azure.ai.projects.aio import AIProjectClient
     from openai.types.evals import RunRetrieveResponse
 
@@ -60,12 +62,12 @@
 @experimental(feature_id=ExperimentalFeature.EVALS)
 @dataclass(frozen=True)
 class RubricDimension:
-    """A single dimension of a Foundry generated rubric evaluator.
+    """A single dimension of a generated rubric evaluator.
 
     Rubric evaluators score each item along one or more named dimensions,
     each with its own description and weight.  Foundry's evaluator
     generation pipeline produces these dimensions from agent/workflow
-    metadata; agent-framework surfaces them so callers can inspect a
+    metadata; ``RubricDimension`` surfaces them so callers can inspect a
     generated evaluator's structure without round-tripping through the
     portal.
 
@@ -152,8 +154,164 @@ def latest(
         )
 
 
-# endregion
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class EvalGenerationSource:
+    """A source description passed to Foundry's evaluator generation pipeline.
+
+    Rubric evaluator generation consumes one or more sources that describe
+    the agent or workflow under evaluation.  ``FoundryEvals`` translates
+    instances into the underlying ``*EvaluatorGenerationJobSource`` SDK
+    types.
+
+    Discriminated by :attr:`type`:
+
+    * ``"prompt"`` - a free-form textual dossier (typical for local agents
+      and workflows whose tools cannot be fetched server-side).
+    * ``"agent"`` - a hosted Foundry agent referenced by name so the
+      service fetches tool definitions and metadata directly.
+    * ``"dataset"`` - a Foundry dataset of recorded interactions.
+    * ``"traces"`` - tracing data scoped by metadata.
+
+    Only the fields relevant to :attr:`type` are populated; the remaining
+    fields stay ``None``.
+
+    Attributes:
+        type: Source kind.  See discriminator above.
+        description: Optional short description shown in Foundry UI.
+        prompt: Rendered dossier for ``type="prompt"`` sources.
+        agent_name: Hosted Foundry agent name for ``type="agent"`` sources.
+        dataset_name: Foundry dataset name for ``type="dataset"`` sources.
+        dataset_version: Pinned dataset version (recommended for repro).
+        metadata: Free-form metadata.  Used by ``type="traces"`` sources
+            for tracing-attribute filters and as a generic escape hatch
+            for additional fields not yet modeled.
+    """
+
+    type: Literal["prompt", "dataset", "agent", "traces"]
+    description: str | None = None
+    prompt: str | None = None
+    agent_name: str | None = None
+    dataset_name: str | None = None
+    dataset_version: str | None = None
+    metadata: dict[str, Any] | None = None
 
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def agent_as_eval_source(
+    agent: BaseAgent,
+    *,
+    include_instructions: bool = True,
+    include_tools: bool = True,
+    include_context_providers: bool = False,
+    include_examples: bool = False,
+    examples: Sequence[str] | None = None,
+    hosted_agent_name: str | None = None,
+) -> EvalGenerationSource:
+    """Package an agent as an :class:`EvalGenerationSource` for rubric generation.
+
+    Without ``hosted_agent_name`` the agent's :meth:`BaseAgent.as_eval_source`
+    dossier is wrapped in a ``type="prompt"`` source.  With a non-empty
+    ``hosted_agent_name`` a ``type="agent"`` source naming the hosted
+    Foundry agent is returned instead; no dossier is rendered, so the
+    ``include_*`` flags and ``examples`` are not used in that case.
+
+    Args:
+        agent: Agent instance (typically a ``BaseAgent`` subclass).  Its
+            ``description`` attribute, when present, becomes the source
+            description.
+        include_instructions: Whether the dossier includes the agent's
+            instructions text.  Defaults to ``True``.
+        include_tools: Whether the dossier includes tool definitions.
+            Defaults to ``True``.
+        include_context_providers: Whether the dossier includes the names
+            of attached context-provider classes.  Defaults to ``False``.
+        include_examples: Whether the dossier includes the supplied
+            ``examples``.  Defaults to ``False``.
+        examples: Optional sample queries / interactions to include when
+            ``include_examples`` is ``True``.
+        hosted_agent_name: When set, emit a ``type="agent"`` source
+            referencing the hosted Foundry agent by name instead of a
+            rendered dossier.
+
+    Returns:
+        An :class:`EvalGenerationSource` describing the agent.
+    """
+    if not hosted_agent_name:
+        # Local agent: ship the rendered plain-text dossier as a prompt source.
+        dossier = agent.as_eval_source(
+            include_instructions=include_instructions,
+            include_tools=include_tools,
+            include_context_providers=include_context_providers,
+            include_examples=include_examples,
+            examples=examples,
+        )
+        return EvalGenerationSource(
+            type="prompt",
+            prompt=dossier,
+            description=getattr(agent, "description", None),
+        )
+
+    # Hosted agent: reference it by name; no dossier is rendered.
+    return EvalGenerationSource(
+        type="agent",
+        agent_name=hosted_agent_name,
+        description=getattr(agent, "description", None),
+    )
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def workflow_as_eval_source(
+    workflow: Workflow,
+    *,
+    include_instructions: bool = True,
+    include_tools: bool = True,
+    include_context_providers: bool = False,
+    include_examples: bool = False,
+    examples: Sequence[str] | None = None,
+    include_topology: bool = True,
+) -> EvalGenerationSource:
+    """Render a workflow as an :class:`EvalGenerationSource` for rubric generation.
+
+    Wraps :meth:`Workflow.as_eval_source` to package the workflow's
+    rendered dossier (workflow name, description, topology, per-agent
+    dossiers) into a typed ``type="prompt"`` Foundry generation source.
+
+    Args:
+        workflow: Workflow instance to render.
+        include_instructions: Per-agent instructions inclusion.
+        include_tools: Per-agent tools inclusion.
+        include_context_providers: Per-agent context-provider inclusion.
+            Defaults to ``False``.
+        include_examples: Whether to include the workflow-level
+            ``examples``.  Defaults to ``False``.
+        examples: Optional workflow-level sample queries.  Rendered into
+            a top-level ``Examples:`` section when ``include_examples`` is
+            ``True``.
+        include_topology: Whether to embed the JSON-encoded workflow
+            topology produced by :meth:`Workflow.to_dict`.  Defaults to
+            ``True``.
+
+    Returns:
+        A ``type="prompt"`` :class:`EvalGenerationSource` describing the
+        workflow.
+    """
+    prompt = workflow.as_eval_source(
+        include_instructions=include_instructions,
+        include_tools=include_tools,
+        include_context_providers=include_context_providers,
+        include_examples=include_examples,
+        examples=examples,
+        include_topology=include_topology,
+    )
+    return EvalGenerationSource(
+        type="prompt",
+        prompt=prompt,
+        description=workflow.description,
+    )
+
+
+# endregion
 # Agent evaluators that accept query/response as conversation arrays.
 # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
 # for the latest evaluator list. These are the evaluators that need conversation-format input.
@@ -307,6 +465,8 @@ def _build_testing_criteria(
                     entry_spec.name,
                 )
             if include_data_mapping:
+                # Rubric evaluators accept conversation arrays like agent
+                # evaluators, plus tool_definitions when items are tool-aware.
                 ref_mapping: dict[str, str] = {
                     "query": "{{item.query_messages}}",
                     "response": "{{item.response_messages}}",
@@ -498,7 +658,6 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int]
     return per_eval
 
 
-
 def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
     """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.
 
@@ -978,6 +1137,329 @@ async def _evaluate_via_dataset(
             provider=self.name,
         )
 
+    @classmethod
+    @experimental(feature_id=ExperimentalFeature.EVALS)
+    async def generate_rubric(
+        cls,
+        *,
+        project_client: AIProjectClient,
+        name: str,
+        agent: BaseAgent | None = None,
+        workflow: Workflow | None = None,
+        sources: Sequence[EvalGenerationSource] | None = None,
+        category: str = "quality",
+        model: str | None = None,
+        display_name: str | None = None,
+        description: str | None = None,
+        operation_id: str | None = None,
+        poll_interval: float = 5.0,
+        timeout: float = 600.0,
+    ) -> GeneratedEvaluatorRef:
+        """Generate a Foundry rubric evaluator from an agent or workflow.
+
+        Drives the Foundry evaluator-generation long-running operation
+        (``client.beta.evaluators.create_generation_job``) end-to-end and
+        returns a pinned :class:`GeneratedEvaluatorRef` for use with
+        :class:`FoundryEvals` ``evaluators=`` lists.
+
+        Exactly one of ``agent``, ``workflow``, or ``sources`` must be
+        supplied.  When ``agent`` or ``workflow`` is given,
+        :func:`agent_as_eval_source` / :func:`workflow_as_eval_source` is
+        used to build a single conservative source (instructions and
+        tools included; examples and context providers excluded).  Pass
+        ``sources=`` directly to control inclusion explicitly or to
+        provide multiple sources.
+
+        Requires ``azure-ai-projects`` with the rubric-generation APIs
+        (currently ``2.3.0a*`` on the Azure SDK dev feed; tracked for an
+        upcoming PyPI release).  Raises :class:`NotImplementedError` with
+        a clear message when the dependency is unavailable.
+
+        Keyword Args:
+            project_client: Async ``AIProjectClient`` for the target
+                Foundry project.
+            name: Evaluator name to register in the project.  Must be a
+                stable identifier (e.g. ``"policy-enforcement-v1"``).
+            agent: Optional ``BaseAgent`` to derive a source from.
+            workflow: Optional ``Workflow`` to derive a source from.
+            sources: Explicit list of :class:`EvalGenerationSource`
+                instances.  Mutually exclusive with ``agent`` / ``workflow``.
+            category: ``"quality"`` or ``"safety"``.  Defaults to
+                ``"quality"``.
+            model: Optional model deployment to drive generation.  When
+                omitted the service picks a default.
+            display_name: Optional human-readable name for the evaluator.
+            description: Optional description for the evaluator.
+            operation_id: Optional caller-supplied operation id to make
+                the create call idempotent.
+            poll_interval: Seconds between job-status polls.
+            timeout: Maximum seconds to wait for the job to complete.
+
+        Returns:
+            A pinned :class:`GeneratedEvaluatorRef` referring to the
+            newly created evaluator.
+
+        Raises:
+            ValueError: If the source arguments are inconsistent.
+            NotImplementedError: If the installed ``azure-ai-projects``
+                version does not expose the rubric APIs.
+            TimeoutError: If the job does not complete within ``timeout``.
+            RuntimeError: If the generation job ends in a non-succeeded
+                terminal state.
+        """
+        resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources)
+
+        try:
+            sdk_types = _import_generation_sdk_types()
+        except _RubricSdkUnavailableError as exc:
+            raise NotImplementedError(str(exc)) from exc
+
+        sdk_sources = [_to_sdk_source(s, sdk_types) for s in resolved_sources]
+
+        inputs_kwargs: dict[str, Any] = {
+            "name": name,
+            "category": category,
+            "sources": sdk_sources,
+        }
+        if model is not None:
+            inputs_kwargs["model"] = model
+        if display_name is not None:
+            inputs_kwargs["display_name"] = display_name
+        if description is not None:
+            inputs_kwargs["description"] = description
+
+        inputs = sdk_types.EvaluatorGenerationInputs(**inputs_kwargs)
+        job = sdk_types.EvaluatorGenerationJob(inputs=inputs)
+
+        create_kwargs: dict[str, Any] = {"job": job}
+        if operation_id is not None:
+            create_kwargs["operation_id"] = operation_id
+
+        evaluators_ops = _get_beta_evaluators(project_client)
+        created = await evaluators_ops.create_generation_job(**create_kwargs)
+        completed = await _poll_generation_job(
+            evaluators_ops,
+            created,
+            poll_interval=poll_interval,
+            timeout=timeout,
+        )
+
+        return _generation_job_to_ref(completed, category=category)
+
+
+_TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"})
+
+
+class _RubricSdkUnavailableError(Exception):
+    """Raised when azure-ai-projects lacks the rubric-generation APIs."""
+
+
+@dataclass(frozen=True)
+class _GenerationSdkTypes:
+    """Resolved SDK type handles for rubric-evaluator generation."""
+
+    EvaluatorGenerationInputs: Any
+    EvaluatorGenerationJob: Any
+    PromptSource: Any
+    AgentSource: Any | None
+    DatasetSource: Any | None
+    TracesSource: Any | None
+
+
+_RUBRIC_SDK_MISSING_MSG = (
+    "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs "
+    "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). "
+    "Install a build that exposes "
+    "`azure.ai.projects.models.EvaluatorGenerationInputs` and "
+    "`AIProjectClient.beta.evaluators.create_generation_job`."
+)
+
+
+def _import_generation_sdk_types() -> _GenerationSdkTypes:
+    """Look up the azure-ai-projects model classes used for rubric generation."""
+    try:
+        from azure.ai.projects import models as _models  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) from exc
+
+    sdk_models: Any = _models
+    # These three classes are mandatory; without any one of them generation cannot run.
+    required: list[Any] = [
+        getattr(sdk_models, cls_name, None)
+        for cls_name in ("EvaluatorGenerationInputs", "EvaluatorGenerationJob", "PromptEvaluatorGenerationJobSource")
+    ]
+    if any(cls is None for cls in required):
+        raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG)
+    inputs_cls, job_cls, prompt_cls = required
+
+    # The remaining source classes are optional and resolve to None when absent.
+    return _GenerationSdkTypes(
+        EvaluatorGenerationInputs=inputs_cls,
+        EvaluatorGenerationJob=job_cls,
+        PromptSource=prompt_cls,
+        AgentSource=getattr(sdk_models, "AgentEvaluatorGenerationJobSource", None),
+        DatasetSource=getattr(sdk_models, "DatasetEvaluatorGenerationJobSource", None),
+        TracesSource=getattr(sdk_models, "TracesEvaluatorGenerationJobSource", None),
+    )
+
+
+def _get_beta_evaluators(project_client: AIProjectClient) -> Any:
+    """Fetch the ``beta.evaluators`` operations group; raise NotImplementedError if absent."""
+    beta_group = getattr(project_client, "beta", None)
+    ops = getattr(beta_group, "evaluators", None)  # a None ``beta`` group also yields None here
+    if ops is not None:
+        return ops
+    raise NotImplementedError(_RUBRIC_SDK_MISSING_MSG)
+
+
+def _coalesce_generation_sources(
+    *,
+    agent: BaseAgent | None,
+    workflow: Workflow | None,
+    sources: Sequence[EvalGenerationSource] | None,
+) -> list[EvalGenerationSource]:
+    """Resolve exactly one of ``agent``/``workflow``/``sources`` into a source list, else raise ValueError."""
+    if sources is not None and not sources:
+        raise ValueError("sources= must contain at least one EvalGenerationSource.")
+    supplied = sum(arg is not None for arg in (agent, workflow, sources))  # presence, not truthiness
+    if supplied != 1:
+        qualifier = "one" if supplied == 0 else "only one"
+        raise ValueError(f"Provide {qualifier} of agent=, workflow=, or sources=.")
+    if sources is not None:
+        return list(sources)
+    if agent is not None:
+        return [agent_as_eval_source(agent)]
+    if workflow is None:  # unreachable after the count check; narrows the type for checkers
+        raise ValueError("workflow= must be provided when agent= and sources= are not set.")
+    return [workflow_as_eval_source(workflow)]
+
+
+def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) -> Any:
+    """Build the SDK ``*EvaluatorGenerationJobSource`` object matching ``source.type``.
+
+    Raises ``ValueError`` for an unknown type or a missing required field, and
+    ``NotImplementedError`` when the installed SDK lacks the needed source class.
+    """
+    kind = source.type
+    # ``description`` is forwarded for every source type, but only when it is set.
+    optional: dict[str, Any] = {} if source.description is None else {"description": source.description}
+
+    if kind == "prompt":
+        if not source.prompt:
+            raise ValueError("EvalGenerationSource(type='prompt') requires a non-empty prompt.")
+        return sdk_types.PromptSource(prompt=source.prompt, **optional)
+
+    if kind == "agent":
+        if sdk_types.AgentSource is None:
+            raise NotImplementedError("Installed azure-ai-projects does not expose AgentEvaluatorGenerationJobSource.")
+        if not source.agent_name:
+            raise ValueError("EvalGenerationSource(type='agent') requires agent_name.")
+        return sdk_types.AgentSource(agent_name=source.agent_name, **optional)
+
+    if kind == "dataset":
+        if sdk_types.DatasetSource is None:
+            raise NotImplementedError(
+                "Installed azure-ai-projects does not expose DatasetEvaluatorGenerationJobSource."
+            )
+        if not source.dataset_name:
+            raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.")
+        if source.dataset_version is not None:
+            optional["dataset_version"] = source.dataset_version
+        return sdk_types.DatasetSource(dataset_name=source.dataset_name, **optional)
+
+    if kind == "traces":
+        if sdk_types.TracesSource is None:
+            raise NotImplementedError("Installed azure-ai-projects does not expose TracesEvaluatorGenerationJobSource.")
+        if source.metadata is not None:
+            optional["metadata"] = source.metadata
+        return sdk_types.TracesSource(**optional)
+
+    raise ValueError(f"Unknown EvalGenerationSource type: {source.type!r}")
+
+
+async def _poll_generation_job(
+    evaluators_ops: Any,
+    job: Any,
+    *,
+    poll_interval: float,
+    timeout: float,
+) -> Any:
+    """Poll the job until it succeeds; raise ``RuntimeError`` if it ends otherwise, ``TimeoutError`` on timeout."""
+    job_id = getattr(job, "id", None)
+    if not job_id:
+        raise RuntimeError("Rubric generation job did not return an id.")
+    loop = asyncio.get_running_loop()  # inside a coroutine this is preferred over get_event_loop()
+    deadline = loop.time() + timeout
+    current = job
+    while True:
+        status = (getattr(current, "status", "") or "").lower()
+        if status in _TERMINAL_GENERATION_STATUSES:
+            if status != "succeeded":
+                err = getattr(current, "error", None)
+                err_msg = (getattr(err, "message", None) or str(err)) if err is not None else status
+                raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {err_msg}")
+            return current
+        if loop.time() >= deadline:
+            raise TimeoutError(
+                f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})."
+            )
+        await asyncio.sleep(poll_interval)
+        current = await evaluators_ops.get_generation_job(job_id)
+
+
+def _generation_job_to_ref(job: Any, *, category: str) -> GeneratedEvaluatorRef:
+    """Convert a completed generation job into a pinned :class:`GeneratedEvaluatorRef`.
+
+    Raises ``RuntimeError`` when the job carries no evaluator artifact, or when
+    that artifact has no name or no version.
+    """
+    # A missing ``artifacts`` object falls through the nested ``getattr`` default.
+    evaluator: Any = getattr(getattr(job, "artifacts", None), "evaluator", None)
+    if evaluator is None:
+        raise RuntimeError("Rubric generation job completed without an evaluator artifact.")
+
+    ev_name = getattr(evaluator, "name", None)
+    ev_version = getattr(evaluator, "version", None)
+    if not ev_name:
+        raise RuntimeError("Generated evaluator artifact is missing a name.")
+    if ev_version is None:
+        raise RuntimeError("Generated evaluator artifact is missing a version.")
+
+    # ``definition`` is optional; without it there are no dimensions and no threshold.
+    definition: Any = getattr(evaluator, "definition", None)
+
+    parsed: list[RubricDimension] = []
+    for entry in getattr(definition, "dimensions", None) or ():
+        try:
+            dimension = RubricDimension(
+                id=str(getattr(entry, "id", "") or ""),
+                description=str(getattr(entry, "description", "") or ""),
+                weight=int(getattr(entry, "weight", 0) or 0),
+                always_applicable=bool(getattr(entry, "always_applicable", False)),
+            )
+        except (TypeError, ValueError):
+            # Best effort: a malformed entry is logged and skipped, not fatal.
+            logger.debug("Skipping malformed dimension on generated evaluator", exc_info=True)
+        else:
+            parsed.append(dimension)
+
+    # Only an ``int``/``float`` value is used as the threshold; anything else leaves it unset.
+    raw_threshold = getattr(definition, "pass_threshold", None)
+    is_number = isinstance(raw_threshold, (int, float))
+
+    # Any category other than "quality" or "safety" is coerced to "quality".
+    valid_category = category if category in ("quality", "safety") else "quality"
+
+    return GeneratedEvaluatorRef(
+        name=str(ev_name),
+        version=str(ev_version),
+        category=cast("Any", valid_category),
+        display_name=getattr(evaluator, "display_name", None),
+        description=getattr(evaluator, "description", None),
+        dimensions=tuple(parsed) if parsed else None,
+        pass_threshold=float(raw_threshold) if is_number else None,
+    )
+
 
 # ---------------------------------------------------------------------------
 # Foundry-specific functions (not part of the Evaluator protocol)
@@ -987,7 +1469,7 @@ async def _evaluate_via_dataset(
 @experimental(feature_id=ExperimentalFeature.EVALS)
 async def evaluate_traces(
     *,
-    evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
+    evaluators: Sequence[str] | None = None,
     client: FoundryChatClient | None = None,
     project_client: AIProjectClient | None = None,
     model: str,
@@ -1080,7 +1562,7 @@ async def evaluate_foundry_target(
     *,
     target: dict[str, Any],
     test_queries: Sequence[str],
-    evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
+    evaluators: Sequence[str] | None = None,
     client: FoundryChatClient | None = None,
     project_client: AIProjectClient | None = None,
     model: str,
@@ -1096,9 +1578,7 @@ async def evaluate_foundry_target(
     Args:
         target: Target configuration dict.
         test_queries: Queries for Foundry to send to the target.
-        evaluators: Evaluator names (built-in shorts / fully-qualified
-            ``builtin.*`` names) or :class:`GeneratedEvaluatorRef`
-            instances for generated rubric evaluators.
+        evaluators: Evaluator names.
         client: A ``FoundryChatClient`` instance. Provide this or *project_client*.
         project_client: An ``AIProjectClient`` instance.
         model: Model deployment name for the evaluator LLM judge.
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index 7502726d1ad..16dc9d50ce7 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -64,6 +64,32 @@ def _make_tool(name: str) -> MagicMock:
     return t
 
 
+def _make_stub_agent(
+    *,
+    name: str = "alpha",
+    description: str = "An agent.",
+    instructions: str = "Be brief.",
+) -> MagicMock:
+    """Mock agent whose as_eval_source returns a real dossier string."""
+    from agent_framework._evaluation import _render_agent_dossier
+
+    agent = MagicMock()
+    agent.name = name
+    agent.description = description
+    agent.default_options = {"instructions": instructions, "tools": []}
+    agent.context_providers = []
+    agent.mcp_tools = []
+    agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier(
+        agent,
+        include_instructions=kw.get("include_instructions", True),
+        include_tools=kw.get("include_tools", True),
+        include_context_providers=kw.get("include_context_providers", False),
+        include_examples=kw.get("include_examples", False),
+        examples=kw.get("examples"),
+    )
+    return agent
+
+
 @dataclass
 class _MockResultCounts:
     """Mock matching the OpenAI SDK ResultCounts Pydantic model shape."""
@@ -806,12 +832,6 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None:
         for c in criteria:
             assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
 
-
-# ---------------------------------------------------------------------------
-# _build_item_schema
-# ---------------------------------------------------------------------------
-
-
     def test_generated_evaluator_ref_pinned_version(self) -> None:
         from agent_framework_foundry import GeneratedEvaluatorRef
 
@@ -1336,12 +1356,6 @@ def test_raises_when_all_filtered(self) -> None:
                 items,
             )
 
-
-# ---------------------------------------------------------------------------
-# EvalResults
-# ---------------------------------------------------------------------------
-
-
     def test_preserves_generated_ref_when_no_tools(self) -> None:
         from agent_framework_foundry import GeneratedEvaluatorRef
 
@@ -2473,12 +2487,6 @@ async def test_handles_api_failure_gracefully(self) -> None:
         items = await _fetch_output_items(mock_client, "eval_1", "run_1")
         assert items == []
 
-
-# ---------------------------------------------------------------------------
-# _poll_eval_run — timeout / failed / canceled paths
-# ---------------------------------------------------------------------------
-
-
     async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
         from agent_framework_foundry._foundry_evals import _fetch_output_items
 
@@ -2521,6 +2529,30 @@ async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
         assert safety.score is None
         assert safety.applicable is False
 
+    async def test_no_rubric_scores_when_absent(self) -> None:
+        from agent_framework_foundry._foundry_evals import _fetch_output_items
+
+        mock_result = MagicMock()
+        mock_result.name = "relevance"
+        mock_result.score = 0.85
+        mock_result.passed = True
+        mock_result.sample = None
+
+        mock_oi = MagicMock()
+        mock_oi.id = "oi_2"
+        mock_oi.status = "pass"
+        mock_oi.results = [mock_result]
+        mock_oi.sample = None
+        mock_oi.datasource_item = {}
+
+        mock_client = MagicMock()
+        mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi]))
+
+        items = await _fetch_output_items(mock_client, "eval_1", "run_1")
+
+        assert items[0].scores[0].dimensions is None
+
+
 class TestExtractRubricScores:
     def test_handles_attribute_style_properties(self) -> None:
         from agent_framework_foundry._foundry_evals import _extract_rubric_scores
@@ -2962,3 +2994,394 @@ async def test_target_without_type_raises(self) -> None:
                 client=mock_client,
                 model="gpt-4o",
             )
+
+
+class TestFoundryAgentAsEvalSource:
+    """Tests for foundry's agent_as_eval_source helper (wraps BaseAgent.as_eval_source)."""
+
+    def test_returns_prompt_source_with_dossier(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        source = agent_as_eval_source(agent)
+        assert source.type == "prompt"
+        assert source.description == "Looks up the weather."
+        assert source.prompt is not None
+        assert "Agent name: weather-bot" in source.prompt
+        assert "Be brief." in source.prompt
+
+    def test_hosted_agent_name_emits_agent_source(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        source = agent_as_eval_source(agent, hosted_agent_name="weather-bot-hosted-id")
+        assert source.type == "agent"
+        assert source.agent_name == "weather-bot-hosted-id"
+        assert source.prompt is None
+        assert source.description == "Looks up the weather."
+
+    def test_forwards_keyword_options_to_agent(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent()
+        source = agent_as_eval_source(agent, include_instructions=False)
+        assert source.prompt is not None
+        assert "Instructions:" not in source.prompt
+
+
+class TestFoundryWorkflowAsEvalSource:
+    """Tests for foundry's workflow_as_eval_source helper (wraps Workflow.as_eval_source)."""
+
+    def _make_workflow(self) -> MagicMock:
+        from agent_framework._evaluation import _render_workflow_dossier
+
+        workflow = MagicMock()
+        workflow.name = "demo-workflow"
+        workflow.description = "Routes user questions."
+        workflow.to_dict.return_value = {
+            "name": "demo-workflow",
+            "id": "wf_1",
+            "executors": {},
+            "edge_groups": [],
+        }
+        workflow.executors = {}
+        workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(
+            workflow,
+            include_instructions=kw.get("include_instructions", True),
+            include_tools=kw.get("include_tools", True),
+            include_context_providers=kw.get("include_context_providers", False),
+            include_examples=kw.get("include_examples", False),
+            examples=kw.get("examples"),
+            include_topology=kw.get("include_topology", True),
+        )
+        return workflow
+
+    def test_returns_prompt_source_with_topology(self) -> None:
+        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
+
+        workflow = self._make_workflow()
+        source = workflow_as_eval_source(workflow)
+        assert source.type == "prompt"
+        assert source.description == "Routes user questions."
+        assert source.prompt is not None
+        assert "Workflow name: demo-workflow" in source.prompt
+        assert "Topology (JSON):" in source.prompt
+
+    def test_topology_can_be_disabled(self) -> None:
+        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
+
+        workflow = self._make_workflow()
+        source = workflow_as_eval_source(workflow, include_topology=False)
+        assert source.prompt is not None
+        assert "Topology (JSON):" not in source.prompt
+
+
+class TestCoalesceGenerationSources:
+    """Validation for the source-resolution helper used by FoundryEvals.generate_rubric."""
+
+    def test_requires_exactly_one_source(self) -> None:
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        with pytest.raises(ValueError, match="Provide one of"):
+            _coalesce_generation_sources(agent=None, workflow=None, sources=None)
+
+    def test_rejects_multiple_sources(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _coalesce_generation_sources
+
+        agent = MagicMock()
+        agent.name = "a"
+        agent.description = None
+        agent.default_options = {"instructions": "x", "tools": []}
+        agent.context_providers = []
+        agent.mcp_tools = []
+        with pytest.raises(ValueError, match="only one of"):
+            _coalesce_generation_sources(
+                agent=agent,
+                workflow=None,
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
+            )
+
+    def test_uses_agent_helper_when_only_agent_supplied(self) -> None:
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        agent = _make_stub_agent(name="alpha", description="An agent.")
+
+        sources = _coalesce_generation_sources(agent=agent, workflow=None, sources=None)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Agent name: alpha" in sources[0].prompt
+
+    def test_rejects_empty_sources_list(self) -> None:
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        with pytest.raises(ValueError, match="at least one"):
+            _coalesce_generation_sources(agent=None, workflow=None, sources=[])
+
+
+class TestToSdkSource:
+    """Translation between EvalGenerationSource and SDK *JobSource types."""
+
+    def _make_sdk_types(self, *, with_agent: bool = True, with_dataset: bool = True, with_traces: bool = True) -> Any:
+        from agent_framework_foundry._foundry_evals import _GenerationSdkTypes
+
+        return _GenerationSdkTypes(
+            EvaluatorGenerationInputs=MagicMock(),
+            EvaluatorGenerationJob=MagicMock(),
+            PromptSource=MagicMock(name="PromptSource"),
+            AgentSource=MagicMock(name="AgentSource") if with_agent else None,
+            DatasetSource=MagicMock(name="DatasetSource") if with_dataset else None,
+            TracesSource=MagicMock(name="TracesSource") if with_traces else None,
+        )
+
+    def test_prompt_source_is_translated(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.PromptSource.return_value = "prompt-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="prompt", prompt="hello", description="d"),
+            sdk,
+        )
+        assert out == "prompt-sdk-instance"
+        sdk.PromptSource.assert_called_once_with(prompt="hello", description="d")
+
+    def test_prompt_without_text_raises(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        with pytest.raises(ValueError, match="non-empty prompt"):
+            _to_sdk_source(EvalGenerationSource(type="prompt"), sdk)
+
+    def test_agent_source_is_translated(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.AgentSource.return_value = "agent-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="agent", agent_name="my-hosted-agent"),
+            sdk,
+        )
+        assert out == "agent-sdk-instance"
+        sdk.AgentSource.assert_called_once_with(agent_name="my-hosted-agent")
+
+    def test_agent_source_requires_name(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        with pytest.raises(ValueError, match="agent_name"):
+            _to_sdk_source(EvalGenerationSource(type="agent"), sdk)
+
+    def test_agent_source_raises_when_sdk_missing(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types(with_agent=False)
+        with pytest.raises(NotImplementedError, match="AgentEvaluatorGenerationJobSource"):
+            _to_sdk_source(
+                EvalGenerationSource(type="agent", agent_name="x"),
+                sdk,
+            )
+
+    def test_dataset_source_is_translated(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.DatasetSource.return_value = "dataset-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="dataset", dataset_name="ds", dataset_version="1"),
+            sdk,
+        )
+        assert out == "dataset-sdk-instance"
+        sdk.DatasetSource.assert_called_once_with(dataset_name="ds", dataset_version="1")
+
+
+class TestPollGenerationJob:
+    """Behavior of the rubric-generation polling loop."""
+
+    async def test_returns_immediately_on_succeeded(self) -> None:
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock()
+        job = MagicMock(id="job_1", status="succeeded")
+        out = await _poll_generation_job(evaluators_ops, job, poll_interval=0.01, timeout=1.0)
+        assert out is job
+        evaluators_ops.get_generation_job.assert_not_called()
+
+    async def test_polls_until_terminal(self) -> None:
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        running = MagicMock(id="job_1", status="running")
+        succeeded = MagicMock(id="job_1", status="succeeded")
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock(side_effect=[running, succeeded])
+
+        initial = MagicMock(id="job_1", status="running")
+        out = await _poll_generation_job(evaluators_ops, initial, poll_interval=0.001, timeout=1.0)
+        assert out is succeeded
+        assert evaluators_ops.get_generation_job.await_count == 2
+
+    async def test_failed_status_raises(self) -> None:
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        err = MagicMock(message="boom")
+        terminal = MagicMock(id="job_1", status="failed", error=err)
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock(return_value=terminal)
+
+        with pytest.raises(RuntimeError, match="boom"):
+            await _poll_generation_job(
+                evaluators_ops,
+                MagicMock(id="job_1", status="running"),
+                poll_interval=0.001,
+                timeout=1.0,
+            )
+
+    async def test_timeout_raises(self) -> None:
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        running = MagicMock(id="job_1", status="running")
+        evaluators_ops = MagicMock()
+        evaluators_ops.get_generation_job = AsyncMock(return_value=running)
+
+        with pytest.raises(TimeoutError):
+            await _poll_generation_job(evaluators_ops, running, poll_interval=0.001, timeout=0.005)
+
+
+class TestGenerationJobToRef:
+    """Translation of a completed generation job to a GeneratedEvaluatorRef."""
+
+    def test_builds_pinned_ref_with_dimensions(self) -> None:
+        from agent_framework_foundry._foundry_evals import RubricDimension, _generation_job_to_ref
+
+        dim = MagicMock(id="d1", description="dim", weight=2, always_applicable=True)
+        definition = MagicMock(dimensions=[dim], pass_threshold=0.75)
+        evaluator = MagicMock(
+            version=3,
+            display_name="My Eval",
+            description="A custom rubric.",
+            definition=definition,
+        )
+        # MagicMock(name=...) only labels the mock, so the ``name`` attribute is assigned after construction.
+        evaluator.name = "my-eval"
+        job = MagicMock(artifacts=MagicMock(evaluator=evaluator))
+
+        ref = _generation_job_to_ref(job, category="quality")
+        assert ref.name == "my-eval"
+        assert ref.version == "3"
+        assert ref.display_name == "My Eval"
+        assert ref.description == "A custom rubric."
+        assert ref.category == "quality"
+        assert ref.pass_threshold == 0.75
+        assert ref.dimensions is not None
+        assert ref.dimensions[0] == RubricDimension(id="d1", description="dim", weight=2, always_applicable=True)
+
+    def test_missing_artifacts_raises(self) -> None:
+        from agent_framework_foundry._foundry_evals import _generation_job_to_ref
+
+        job = MagicMock(artifacts=None)
+        with pytest.raises(RuntimeError, match="evaluator artifact"):
+            _generation_job_to_ref(job, category="quality")
+
+
+class TestGenerateRubricSdkMissing:
+    """generate_rubric raises NotImplementedError when SDK lacks the rubric APIs."""
+
+    async def test_raises_when_sdk_types_unavailable(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+        def _raise() -> Any:
+            raise fm._RubricSdkUnavailableError(fm._RUBRIC_SDK_MISSING_MSG)
+
+        monkeypatch.setattr(fm, "_import_generation_sdk_types", _raise)
+
+        project_client = MagicMock()
+
+        with pytest.raises(NotImplementedError, match="rubric"):
+            await FoundryEvals.generate_rubric(
+                project_client=project_client,
+                name="my-eval",
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
+            )
+
+
+class TestGenerateRubricE2E:
+    """End-to-end happy path for generate_rubric with mocked SDK."""
+
+    async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        # Stub SDK type handles
+        prompt_cls = MagicMock(name="PromptSource")
+        prompt_cls.return_value = "sdk-prompt"
+        inputs_cls = MagicMock(name="EvaluatorGenerationInputs")
+        inputs_cls.return_value = "sdk-inputs"
+        job_cls = MagicMock(name="EvaluatorGenerationJob")
+        job_cls.return_value = "sdk-job"
+
+        sdk_types = fm._GenerationSdkTypes(
+            EvaluatorGenerationInputs=inputs_cls,
+            EvaluatorGenerationJob=job_cls,
+            PromptSource=prompt_cls,
+            AgentSource=None,
+            DatasetSource=None,
+            TracesSource=None,
+        )
+        monkeypatch.setattr(fm, "_import_generation_sdk_types", lambda: sdk_types)
+
+        # Mock the SDK operations and completed job
+        completed_evaluator = MagicMock(version="7", display_name=None, description=None)
+        completed_evaluator.name = "agent-rubric"
+        completed_evaluator.definition = MagicMock(dimensions=[], pass_threshold=None)
+        completed = MagicMock(
+            id="job_42",
+            status="succeeded",
+            artifacts=MagicMock(evaluator=completed_evaluator),
+        )
+
+        evaluators_ops = MagicMock()
+        evaluators_ops.create_generation_job = AsyncMock(return_value=completed)
+        evaluators_ops.get_generation_job = AsyncMock(return_value=completed)
+        project_client = MagicMock()
+        project_client.beta = MagicMock(evaluators=evaluators_ops)
+
+        # Build a stub agent
+        agent = _make_stub_agent(
+            name="weather-bot",
+            description="Looks up weather.",
+            instructions="Be brief.",
+        )
+
+        ref = await FoundryEvals.generate_rubric(
+            project_client=project_client,
+            name="agent-rubric",
+            agent=agent,
+            category="quality",
+            model="gpt-4o",
+            display_name="Display",
+            description="Desc",
+            operation_id="op-123",
+        )
+
+        assert ref.name == "agent-rubric"
+        assert ref.version == "7"
+        assert ref.category == "quality"
+
+        # Verify inputs/job/source assembly
+        prompt_cls.assert_called_once()
+        prompt_kwargs = prompt_cls.call_args.kwargs
+        assert "Agent name: weather-bot" in prompt_kwargs["prompt"]
+        assert prompt_kwargs["description"] == "Looks up weather."
+
+        inputs_cls.assert_called_once()
+        inputs_kwargs = inputs_cls.call_args.kwargs
+        assert inputs_kwargs["name"] == "agent-rubric"
+        assert inputs_kwargs["category"] == "quality"
+        assert inputs_kwargs["model"] == "gpt-4o"
+        assert inputs_kwargs["display_name"] == "Display"
+        assert inputs_kwargs["description"] == "Desc"
+        assert inputs_kwargs["sources"] == ["sdk-prompt"]
+
+        job_cls.assert_called_once_with(inputs="sdk-inputs")
+        evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123")

From 4c7f94f665562860d660c707d2ad5418db96c08d Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Wed, 27 May 2026 08:22:00 -0700
Subject: [PATCH 06/16] Python: feat(foundry-evals): YAML config loader +
 sample

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/__init__.py       |  12 +
 .../agent_framework_foundry/_evals_config.py  | 403 ++++++++++++++++++
 .../foundry/tests/test_evals_config.py        | 273 ++++++++++++
 .../evaluate_with_generated_rubric_sample.py  | 151 +++++++
 .../evaluation/foundry_evals/evaluators.yaml  |  11 +
 5 files changed, 850 insertions(+)
 create mode 100644 python/packages/foundry/agent_framework_foundry/_evals_config.py
 create mode 100644 python/packages/foundry/tests/test_evals_config.py
 create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
 create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml

diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py
index cafe30eb955..efbe0b8d248 100644
--- a/python/packages/foundry/agent_framework_foundry/__init__.py
+++ b/python/packages/foundry/agent_framework_foundry/__init__.py
@@ -10,6 +10,13 @@
     FoundryEmbeddingSettings,
     RawFoundryEmbeddingClient,
 )
+from ._evals_config import (
+    RubricGenerationSpec,
+    RubricSourceSpec,
+    build_sources,
+    load_evals_config,
+    parse_evals_config,
+)
 from ._foundry_evals import (
     EvalGenerationSource,
     FoundryEvals,
@@ -44,9 +51,14 @@
     "RawFoundryChatClient",
     "RawFoundryEmbeddingClient",
     "RubricDimension",
+    "RubricGenerationSpec",
+    "RubricSourceSpec",
     "__version__",
     "agent_as_eval_source",
+    "build_sources",
     "evaluate_foundry_target",
     "evaluate_traces",
+    "load_evals_config",
+    "parse_evals_config",
     "workflow_as_eval_source",
 ]
diff --git a/python/packages/foundry/agent_framework_foundry/_evals_config.py b/python/packages/foundry/agent_framework_foundry/_evals_config.py
new file mode 100644
index 00000000000..5f45e2854b8
--- /dev/null
+++ b/python/packages/foundry/agent_framework_foundry/_evals_config.py
@@ -0,0 +1,403 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""YAML-driven evaluator configuration for rubric generation and evaluation.
+
+Defines the source-controlled config schema described in
+``adaptive-evals-draft.md``: a list of named rubric-generation specs that
+CI jobs and harnesses parse to drive
+:meth:`FoundryEvals.generate_rubric`.
+
+Example config:
+
+.. code-block:: yaml
+
+    evaluators:
+      reservation-agent-quality:
+        type: foundry.generated_rubric
+        category: quality
+        model: gpt-4o
+        agent: reservation-agent
+        sources:
+          - type: agent
+            include_instructions: true
+            include_tools: true
+          - type: dataset
+            name: reservation-business-rules
+            version: "1"
+
+Example loader usage:
+
+.. code-block:: python
+
+    from agent_framework_foundry import FoundryEvals, build_sources, load_evals_config
+
+    config = load_evals_config("evaluators.yaml")
+    spec = config["reservation-agent-quality"]
+    sources = build_sources(spec, agent=agent)
+    ref = await FoundryEvals.generate_rubric(
+        project_client=client,
+        name=spec.name,
+        sources=sources,
+        category=spec.category,
+        model=spec.model,
+        display_name=spec.display_name,
+        description=spec.description,
+    )
+"""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Literal, cast
+
+from agent_framework._feature_stage import ExperimentalFeature, experimental
+
+from ._foundry_evals import (
+    EvalGenerationSource,
+    agent_as_eval_source,
+    workflow_as_eval_source,
+)
+
+_RUBRIC_TYPE = "foundry.generated_rubric"
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricSourceSpec:
+    """A single source entry in a :class:`RubricGenerationSpec` ``sources`` list.
+
+    Mirrors the per-source YAML schema.  The :attr:`type` field is the
+    discriminator; only the fields relevant to each type are read.
+
+    Attributes:
+        type: One of ``"agent"``, ``"workflow"``, ``"prompt"``,
+            ``"dataset"``, ``"traces"``.
+        description: Optional description shown in Foundry UI.
+        include_instructions: Whether to include the bound agent /
+            workflow's instructions.  Applies to ``"agent"`` and
+            ``"workflow"`` types.
+        include_tools: Whether to include the bound agent / workflow's
+            tools.  Applies to ``"agent"`` and ``"workflow"`` types.
+        include_context_providers: Whether to include attached
+            context-provider class names.  Applies to ``"agent"`` and
+            ``"workflow"`` types.
+        include_examples: Whether to include ``examples``.  Applies to
+            ``"agent"`` and ``"workflow"`` types.
+        include_topology: Whether to include the JSON-encoded topology.
+            Applies to ``"workflow"`` type.
+        examples: Optional list of example queries for ``"agent"`` /
+            ``"workflow"`` sources.
+        prompt: Rendered dossier for ``"prompt"`` type.
+        agent_name: Hosted Foundry agent name for ``"agent"`` type with
+            a server-side reference.
+        name: Dataset name for ``"dataset"`` type.
+        version: Pinned dataset version.
+        metadata: Free-form metadata for ``"traces"`` sources.
+    """
+
+    type: Literal["agent", "workflow", "prompt", "dataset", "traces"]
+    description: str | None = None
+    include_instructions: bool = True
+    include_tools: bool = True
+    include_context_providers: bool = False
+    include_examples: bool = False
+    include_topology: bool = True
+    examples: tuple[str, ...] = field(default_factory=tuple)
+    prompt: str | None = None
+    agent_name: str | None = None
+    name: str | None = None
+    version: str | None = None
+    metadata: dict[str, Any] | None = None
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricGenerationSpec:
+    """A single named entry from an evaluators YAML config.
+
+    Attributes:
+        name: Evaluator name (the YAML key under ``evaluators``).
+        type: Discriminator literal.  Must be
+            ``"foundry.generated_rubric"`` for rubric evaluators.
+        category: ``"quality"`` or ``"safety"``.
+        model: Optional model deployment to drive generation.
+        agent: Optional symbolic reference to the agent in the
+            caller's harness.  Resolved by user code into a
+            :class:`BaseAgent` and passed to
+            :func:`build_sources`.
+        workflow: Optional symbolic reference to a workflow.
+        display_name: Optional human-readable name.
+        description: Optional description.
+        sources: List of source specs to feed into generation.  When
+            empty, callers typically default to a single
+            ``RubricSourceSpec(type='agent')`` or
+            ``RubricSourceSpec(type='workflow')`` source.
+    """
+
+    name: str
+    type: str = _RUBRIC_TYPE
+    category: Literal["quality", "safety"] = "quality"
+    model: str | None = None
+    agent: str | None = None
+    workflow: str | None = None
+    display_name: str | None = None
+    description: str | None = None
+    sources: tuple[RubricSourceSpec, ...] = field(default_factory=tuple)
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def load_evals_config(path: str | os.PathLike[str]) -> dict[str, RubricGenerationSpec]:
+    """Load a YAML evaluators config and return a name -> spec mapping.
+
+    Reads ``path`` (UTF-8) and parses the top-level ``evaluators``
+    mapping into :class:`RubricGenerationSpec` instances keyed by name.
+
+    Requires ``PyYAML``.  Raises :class:`ImportError` with a helpful
+    message when PyYAML is not installed.
+
+    Args:
+        path: Filesystem path to the YAML config.
+
+    Returns:
+        A dict mapping evaluator name to :class:`RubricGenerationSpec`.
+
+    Raises:
+        ImportError: If PyYAML is not installed.
+        ValueError: If the parsed YAML does not have the expected structure.
+    """
+    try:
+        import yaml  # type: ignore[import-untyped]
+    except ImportError as exc:
+        raise ImportError("load_evals_config requires PyYAML.  Install with `pip install pyyaml`.") from exc
+
+    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
+    return parse_evals_config(raw)
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def parse_evals_config(data: Any) -> dict[str, RubricGenerationSpec]:
+    """Parse an already-loaded YAML mapping into rubric-generation specs.
+
+    Useful when callers manage YAML loading themselves (e.g. CI that
+    interpolates env vars before parsing).
+
+    Args:
+        data: A mapping with an ``"evaluators"`` key containing a mapping
+            of evaluator names to spec dicts.
+
+    Returns:
+        A dict mapping evaluator name to :class:`RubricGenerationSpec`.
+
+    Raises:
+        ValueError: If the structure is malformed.
+    """
+    if not isinstance(data, Mapping):
+        raise ValueError("Evaluators config must be a mapping.")
+    data_map = cast("Mapping[str, Any]", data)
+    raw_evaluators = data_map.get("evaluators")
+    if raw_evaluators is None:
+        raise ValueError("Evaluators config is missing a top-level 'evaluators' key.")
+    if not isinstance(raw_evaluators, Mapping):
+        raise ValueError("Evaluators config 'evaluators' entry must be a mapping.")
+    evaluators = cast("Mapping[str, Any]", raw_evaluators)
+
+    parsed: dict[str, RubricGenerationSpec] = {}
+    for name, raw in evaluators.items():
+        if not isinstance(raw, Mapping):
+            raise ValueError(f"Evaluator entry {name!r} must be a mapping, got {type(raw).__name__}.")
+        raw_map = cast("Mapping[str, Any]", raw)
+        parsed[name] = _parse_spec(name, raw_map)
+    return parsed
+
+
+def _parse_spec(name: str, raw: Mapping[str, Any]) -> RubricGenerationSpec:
+    type_value = raw.get("type", _RUBRIC_TYPE)
+    if type_value != _RUBRIC_TYPE:
+        raise ValueError(f"Evaluator {name!r} has unsupported type {type_value!r}; expected {_RUBRIC_TYPE!r}.")
+    category = raw.get("category", "quality")
+    if category not in ("quality", "safety"):
+        raise ValueError(f"Evaluator {name!r} has invalid category {category!r}; expected 'quality' or 'safety'.")
+
+    raw_sources_obj: Any = raw.get("sources") or ()
+    if not isinstance(raw_sources_obj, (list, tuple)):
+        raise ValueError(f"Evaluator {name!r} 'sources' must be a list.")
+    sources_iter: list[Any] = list(cast("Any", raw_sources_obj))
+    sources: list[RubricSourceSpec] = []
+    for index, raw_source in enumerate(sources_iter):
+        if not isinstance(raw_source, Mapping):
+            raise ValueError(
+                f"Evaluator {name!r} source entry {index} must be a mapping, got {type(raw_source).__name__}."
+            )
+        sources.append(_parse_source(name, index, cast("Mapping[str, Any]", raw_source)))
+
+    return RubricGenerationSpec(
+        name=name,
+        type=type_value,
+        category=category,
+        model=raw.get("model"),
+        agent=raw.get("agent"),
+        workflow=raw.get("workflow"),
+        display_name=raw.get("display_name"),
+        description=raw.get("description"),
+        sources=tuple(sources),
+    )
+
+
+def _parse_source(spec_name: str, index: int, raw: Mapping[str, Any]) -> RubricSourceSpec:
+    type_value = raw.get("type")
+    if type_value not in ("agent", "workflow", "prompt", "dataset", "traces"):
+        raise ValueError(
+            f"Evaluator {spec_name!r} source {index} has invalid type {type_value!r}; "
+            "expected one of 'agent', 'workflow', 'prompt', 'dataset', 'traces'."
+        )
+
+    examples_raw: Any = raw.get("examples") or ()
+    if not isinstance(examples_raw, (list, tuple)):
+        raise ValueError(f"Evaluator {spec_name!r} source {index} 'examples' must be a list.")
+    examples_iter: list[Any] = list(cast("Any", examples_raw))
+    examples = tuple(str(e) for e in examples_iter)
+
+    metadata_raw = raw.get("metadata")
+    if metadata_raw is not None and not isinstance(metadata_raw, Mapping):
+        raise ValueError(f"Evaluator {spec_name!r} source {index} 'metadata' must be a mapping.")
+
+    return RubricSourceSpec(
+        type=cast("Any", type_value),
+        description=raw.get("description"),
+        include_instructions=bool(raw.get("include_instructions", True)),
+        include_tools=bool(raw.get("include_tools", True)),
+        include_context_providers=bool(raw.get("include_context_providers", False)),
+        include_examples=bool(raw.get("include_examples", False)),
+        include_topology=bool(raw.get("include_topology", True)),
+        examples=examples,
+        prompt=raw.get("prompt"),
+        agent_name=raw.get("agent_name"),
+        name=raw.get("name"),
+        version=str(raw.get("version")) if raw.get("version") is not None else None,
+        metadata=dict(cast("Mapping[str, Any]", metadata_raw)) if metadata_raw is not None else None,
+    )
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def build_sources(
+    spec: RubricGenerationSpec,
+    *,
+    agent: Any | None = None,
+    workflow: Any | None = None,
+) -> list[EvalGenerationSource]:
+    """Translate a spec's source list into :class:`EvalGenerationSource` instances.
+
+    Resolves each :class:`RubricSourceSpec` against the supplied
+    ``agent`` and ``workflow`` instances:
+
+    * ``type='agent'`` sources call :func:`agent_as_eval_source` with
+      the spec's include-flags.  If the source carries an
+      ``agent_name`` the agent is referenced server-side instead.
+    * ``type='workflow'`` sources call
+      :func:`workflow_as_eval_source` with the spec's include-flags.
+    * ``type='prompt'``, ``type='dataset'``, and ``type='traces'``
+      sources are translated directly into
+      :class:`EvalGenerationSource` instances without consulting the
+      runtime agent or workflow.
+
+    When the spec has no ``sources`` entries, defaults to a single
+    ``type='agent'`` source when an ``agent`` is provided; otherwise to a
+    single ``type='workflow'`` source when a ``workflow`` is provided.
+
+    Args:
+        spec: Parsed :class:`RubricGenerationSpec`.
+        agent: Optional agent instance for ``type='agent'`` sources.
+        workflow: Optional workflow instance for ``type='workflow'``
+            sources.
+
+    Returns:
+        A list of :class:`EvalGenerationSource` instances ready to pass
+        to :meth:`FoundryEvals.generate_rubric` as ``sources=``.
+
+    Raises:
+        ValueError: If a required agent, workflow, ``prompt``, or dataset ``name``
+            is missing, or there are no sources and no agent/workflow to default to.
+    """
+    if not spec.sources:
+        if agent is not None:
+            return [agent_as_eval_source(agent)]
+        if workflow is not None:
+            return [workflow_as_eval_source(workflow)]
+        raise ValueError(f"Spec {spec.name!r} has no sources and no agent/workflow was provided to build_sources().")
+
+    out: list[EvalGenerationSource] = []
+    for src in spec.sources:
+        if src.type == "agent":
+            if src.agent_name:
+                out.append(
+                    EvalGenerationSource(
+                        type="agent",
+                        agent_name=src.agent_name,
+                        description=src.description,
+                    )
+                )
+                continue
+            if agent is None:
+                raise ValueError(f"Spec {spec.name!r} has a source of type 'agent' but no agent= was provided.")
+            out.append(
+                agent_as_eval_source(
+                    agent,
+                    include_instructions=src.include_instructions,
+                    include_tools=src.include_tools,
+                    include_context_providers=src.include_context_providers,
+                    include_examples=src.include_examples,
+                    examples=list(src.examples) if src.examples else None,
+                )
+            )
+        elif src.type == "workflow":
+            if workflow is None:
+                raise ValueError(f"Spec {spec.name!r} has a source of type 'workflow' but no workflow= was provided.")
+            out.append(
+                workflow_as_eval_source(
+                    workflow,
+                    include_instructions=src.include_instructions,
+                    include_tools=src.include_tools,
+                    include_context_providers=src.include_context_providers,
+                    include_examples=src.include_examples,
+                    examples=list(src.examples) if src.examples else None,
+                    include_topology=src.include_topology,
+                )
+            )
+        elif src.type == "prompt":
+            if not src.prompt:
+                raise ValueError(f"Spec {spec.name!r} has a 'prompt' source missing the 'prompt' field.")
+            out.append(EvalGenerationSource(type="prompt", prompt=src.prompt, description=src.description))
+        elif src.type == "dataset":
+            if not src.name:
+                raise ValueError(f"Spec {spec.name!r} has a 'dataset' source missing the 'name' field.")
+            out.append(
+                EvalGenerationSource(
+                    type="dataset",
+                    dataset_name=src.name,
+                    dataset_version=src.version,
+                    description=src.description,
+                )
+            )
+        elif src.type == "traces":
+            out.append(
+                EvalGenerationSource(
+                    type="traces",
+                    description=src.description,
+                    metadata=src.metadata,
+                )
+            )
+        else:  # pragma: no cover - guarded by _parse_source
+            raise ValueError(f"Spec {spec.name!r} has unknown source type {src.type!r}.")
+    return out
+
+
+__all__ = [
+    "RubricGenerationSpec",
+    "RubricSourceSpec",
+    "build_sources",
+    "load_evals_config",
+    "parse_evals_config",
+]
diff --git a/python/packages/foundry/tests/test_evals_config.py b/python/packages/foundry/tests/test_evals_config.py
new file mode 100644
index 00000000000..a1c86187d47
--- /dev/null
+++ b/python/packages/foundry/tests/test_evals_config.py
@@ -0,0 +1,273 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Tests for the YAML-driven evaluator configuration loader."""
+
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+from agent_framework_foundry._evals_config import (
+    RubricGenerationSpec,
+    RubricSourceSpec,
+    build_sources,
+    load_evals_config,
+    parse_evals_config,
+)
+from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+
+def _make_agent(name: str = "agent-a", instructions: str = "Be brief.") -> Any:
+    from agent_framework._evaluation import _render_agent_dossier
+
+    agent = MagicMock()
+    agent.name = name
+    agent.description = f"{name} description"
+    agent.default_options = {"instructions": instructions, "tools": []}
+    agent.context_providers = []
+    agent.mcp_tools = []
+    agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier(
+        agent,
+        include_instructions=kw.get("include_instructions", True),
+        include_tools=kw.get("include_tools", True),
+        include_context_providers=kw.get("include_context_providers", False),
+        include_examples=kw.get("include_examples", False),
+        examples=kw.get("examples"),
+    )
+    return agent
+
+
+def _make_workflow() -> Any:
+    from agent_framework._evaluation import _render_workflow_dossier
+
+    workflow = MagicMock()
+    workflow.name = "wf-1"
+    workflow.description = "demo"
+    workflow.to_dict.return_value = {"name": "wf-1", "id": "wf_1", "executors": {}, "edge_groups": []}
+    workflow.executors = {}
+    workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(
+        workflow,
+        include_instructions=kw.get("include_instructions", True),
+        include_tools=kw.get("include_tools", True),
+        include_context_providers=kw.get("include_context_providers", False),
+        include_examples=kw.get("include_examples", False),
+        examples=kw.get("examples"),
+        include_topology=kw.get("include_topology", True),
+    )
+    return workflow
+
+
+class TestParseEvalsConfig:
+    """Parsing already-loaded dicts into RubricGenerationSpec instances."""
+
+    def test_minimal_spec(self) -> None:
+        config = parse_evals_config({
+            "evaluators": {
+                "my-rubric": {
+                    "type": "foundry.generated_rubric",
+                }
+            }
+        })
+        assert "my-rubric" in config
+        spec = config["my-rubric"]
+        assert spec.name == "my-rubric"
+        assert spec.type == "foundry.generated_rubric"
+        assert spec.category == "quality"
+        assert spec.sources == ()
+
+    def test_full_spec_with_sources(self) -> None:
+        config = parse_evals_config({
+            "evaluators": {
+                "reservation-quality": {
+                    "type": "foundry.generated_rubric",
+                    "category": "quality",
+                    "model": "gpt-4o",
+                    "agent": "reservation-agent",
+                    "display_name": "Reservation Quality",
+                    "description": "Custom rubric for reservation agent.",
+                    "sources": [
+                        {
+                            "type": "agent",
+                            "include_instructions": True,
+                            "include_tools": True,
+                            "include_context_providers": True,
+                        },
+                        {
+                            "type": "dataset",
+                            "name": "reservation-business-rules",
+                            "version": 1,
+                        },
+                    ],
+                }
+            }
+        })
+        spec = config["reservation-quality"]
+        assert spec.model == "gpt-4o"
+        assert spec.agent == "reservation-agent"
+        assert spec.display_name == "Reservation Quality"
+        assert len(spec.sources) == 2
+
+        agent_src = spec.sources[0]
+        assert agent_src.type == "agent"
+        assert agent_src.include_context_providers is True
+
+        dataset_src = spec.sources[1]
+        assert dataset_src.type == "dataset"
+        assert dataset_src.name == "reservation-business-rules"
+        assert dataset_src.version == "1"  # coerced to string
+
+    def test_rejects_non_mapping(self) -> None:
+        with pytest.raises(ValueError, match="must be a mapping"):
+            parse_evals_config([])
+
+    def test_rejects_missing_evaluators_key(self) -> None:
+        with pytest.raises(ValueError, match="evaluators"):
+            parse_evals_config({"other": {}})
+
+    def test_rejects_unknown_type(self) -> None:
+        with pytest.raises(ValueError, match="unsupported type"):
+            parse_evals_config({"evaluators": {"x": {"type": "foundry.other"}}})
+
+    def test_rejects_invalid_category(self) -> None:
+        with pytest.raises(ValueError, match="invalid category"):
+            parse_evals_config({"evaluators": {"x": {"type": "foundry.generated_rubric", "category": "bogus"}}})
+
+    def test_rejects_invalid_source_type(self) -> None:
+        with pytest.raises(ValueError, match="invalid type"):
+            parse_evals_config({
+                "evaluators": {
+                    "x": {
+                        "type": "foundry.generated_rubric",
+                        "sources": [{"type": "bogus"}],
+                    }
+                }
+            })
+
+
+class TestLoadEvalsConfig:
+    """End-to-end YAML loading."""
+
+    def test_load_from_yaml_file(self, tmp_path: Path) -> None:
+        pytest.importorskip("yaml")
+        config_path = tmp_path / "evals.yaml"
+        config_path.write_text(
+            textwrap.dedent(
+                """\
+                evaluators:
+                  my-eval:
+                    type: foundry.generated_rubric
+                    category: safety
+                    model: gpt-4o-mini
+                    sources:
+                      - type: prompt
+                        prompt: "Score the response."
+                """
+            ),
+            encoding="utf-8",
+        )
+        config = load_evals_config(config_path)
+        assert "my-eval" in config
+        spec = config["my-eval"]
+        assert spec.category == "safety"
+        assert spec.model == "gpt-4o-mini"
+        assert len(spec.sources) == 1
+        assert spec.sources[0].type == "prompt"
+        assert spec.sources[0].prompt == "Score the response."
+
+
+class TestBuildSources:
+    """Translate RubricGenerationSpec sources into EvalGenerationSource instances."""
+
+    def test_no_sources_with_agent_default(self) -> None:
+        spec = RubricGenerationSpec(name="x")
+        agent = _make_agent()
+        sources = build_sources(spec, agent=agent)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Agent name: agent-a" in sources[0].prompt
+
+    def test_no_sources_with_workflow_default(self) -> None:
+        spec = RubricGenerationSpec(name="x")
+        workflow = _make_workflow()
+        sources = build_sources(spec, workflow=workflow)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Workflow name: wf-1" in sources[0].prompt
+
+    def test_no_sources_no_agent_or_workflow_raises(self) -> None:
+        spec = RubricGenerationSpec(name="x")
+        with pytest.raises(ValueError, match="no sources"):
+            build_sources(spec)
+
+    def test_agent_source_uses_supplied_agent(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="agent", include_context_providers=True),),
+        )
+        agent = _make_agent()
+        sources = build_sources(spec, agent=agent)
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Agent name: agent-a" in sources[0].prompt
+
+    def test_agent_source_with_agent_name_uses_hosted_path(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="agent", agent_name="hosted-foundry-agent"),),
+        )
+        sources = build_sources(spec)
+        assert sources[0].type == "agent"
+        assert sources[0].agent_name == "hosted-foundry-agent"
+
+    def test_agent_source_without_agent_raises(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="agent"),),
+        )
+        with pytest.raises(ValueError, match="no agent="):
+            build_sources(spec)
+
+    def test_workflow_source_uses_supplied_workflow(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="workflow", include_topology=False),),
+        )
+        workflow = _make_workflow()
+        sources = build_sources(spec, workflow=workflow)
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Workflow name: wf-1" in sources[0].prompt
+        assert "Topology (JSON):" not in sources[0].prompt
+
+    def test_prompt_source_translates_directly(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="prompt", prompt="Score it."),),
+        )
+        sources = build_sources(spec)
+        assert sources[0] == EvalGenerationSource(type="prompt", prompt="Score it.")
+
+    def test_dataset_source_translates(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="dataset", name="ds", version="2"),),
+        )
+        sources = build_sources(spec)
+        assert sources[0].type == "dataset"
+        assert sources[0].dataset_name == "ds"
+        assert sources[0].dataset_version == "2"
+
+    def test_traces_source_passes_metadata(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="traces", metadata={"environment": "prod"}),),
+        )
+        sources = build_sources(spec)
+        assert sources[0].type == "traces"
+        assert sources[0].metadata == {"environment": "prod"}
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
new file mode 100644
index 00000000000..9c19ff552ba
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
@@ -0,0 +1,151 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Generate a Foundry rubric evaluator from an agent and use it in CI.
+
+This sample demonstrates the end-to-end adaptive-evals flow:
+
+1. Build an agent.
+2. Generate a rubric evaluator from the agent using
+   ``FoundryEvals.generate_rubric()`` — produces a pinned
+   ``GeneratedEvaluatorRef`` you can store in source control.
+3. Use the pinned reference in ``evaluators=[...]`` for a regression
+   run alongside built-in evaluators.
+4. Assert quality gates with ``assert_score_at_least`` /
+   ``assert_dimension_score_at_least`` / ``assert_no_failed_items``.
+
+A companion ``evaluators.yaml`` shows the source-controlled config
+pattern for CI.  Load it with :func:`load_evals_config` and pass the
+resulting spec through :func:`build_sources` to keep generation
+parameters out of code.
+
+Prerequisites:
+- An Azure AI Foundry project with a deployed model.
+- ``azure-ai-projects`` build that includes the rubric-generation APIs.
+- Set ``FOUNDRY_PROJECT_ENDPOINT`` (and optionally ``FOUNDRY_MODEL``, default ``gpt-4o``) in ``.env``.
+
+Run with:
+
+.. code-block:: bash
+
+    az login
+    python evaluate_with_generated_rubric_sample.py
+"""
+
+import asyncio
+import os
+import textwrap
+from pathlib import Path
+
+from agent_framework import evaluate_agent
+from agent_framework.foundry import (
+    FoundryChatClient,
+    FoundryEvals,
+    build_sources,
+    load_evals_config,
+)
+from azure.ai.projects.aio import AIProjectClient
+from azure.identity.aio import AzureCliCredential
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def get_weather(location: str) -> str:
+    """Get the current weather for a location."""
+    samples = {
+        "seattle": "62F, cloudy with a chance of rain",
+        "london": "55F, overcast",
+        "paris": "68F, partly sunny",
+    }
+    return samples.get(location.lower(), f"Weather data not available for {location}")
+
+
+SAMPLE_YAML = textwrap.dedent(
+    """\
+    evaluators:
+      travel-quality:
+        type: foundry.generated_rubric
+        category: quality
+        model: gpt-4o
+        display_name: Travel Quality Rubric
+        description: Custom rubric tailored to the travel-assistant agent.
+        sources:
+          - type: agent
+            include_instructions: true
+            include_tools: true
+    """
+)
+
+
+async def main() -> None:
+    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
+    model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o")
+
+    credential = AzureCliCredential()
+    chat_client = FoundryChatClient(
+        project_endpoint=project_endpoint,
+        model=model_name,
+        credential=credential,
+    )
+    project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)
+
+    agent = chat_client.as_agent(
+        name="travel-assistant",
+        instructions=(
+            "You are a helpful travel assistant.  Always ground recommendations in tool output, "
+            "cite each tool result, and refuse questions outside travel planning."
+        ),
+        tools=[get_weather],
+    )
+
+    # 1. Load the source-controlled evaluator config.
+    config_path = Path(__file__).with_name("evaluators.yaml")
+    if not config_path.exists():
+        config_path.write_text(SAMPLE_YAML, encoding="utf-8")
+        print(f"Wrote sample config to {config_path}")
+    config = load_evals_config(config_path)
+    spec = config["travel-quality"]
+
+    # 2. Generate (or refresh) the rubric evaluator.  In CI you typically run
+    # this once and commit the returned name/version pair.
+    print("Generating rubric evaluator from agent + spec...")
+    sources = build_sources(spec, agent=agent)
+    rubric_ref = await FoundryEvals.generate_rubric(
+        project_client=project_client,
+        name=spec.name,
+        sources=sources,
+        category=spec.category,
+        model=spec.model,
+        display_name=spec.display_name,
+        description=spec.description,
+    )
+    print(f"Generated rubric {rubric_ref.name}@{rubric_ref.version} with {len(rubric_ref.dimensions or ())} dimensions")
+
+    # 3. Run an evaluation that combines built-ins with the new rubric.
+    evals = FoundryEvals(
+        client=chat_client,
+        evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref],
+    )
+    results = await evaluate_agent(
+        agent=agent,
+        queries=[
+            "What's the weather in Seattle?",
+            "Should I pack an umbrella for London?",
+        ],
+        evaluators=evals,
+    )
+
+    # 4. Quality gates — wire these into your CI job's exit status.
+    for r in results:
+        print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}")
+        r.assert_no_failed_items()
+        r.assert_score_at_least(0.8)
+        if rubric_ref.dimensions:
+            r.assert_dimension_score_at_least(rubric_ref.dimensions[0].id, 3)
+
+    await project_client.close()
+    await credential.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
new file mode 100644
index 00000000000..f3e698c77ce
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
@@ -0,0 +1,11 @@
+evaluators:
+  travel-quality:
+    type: foundry.generated_rubric
+    category: quality
+    model: gpt-4o
+    display_name: Travel Quality Rubric
+    description: Custom rubric tailored to the travel-assistant agent.
+    sources:
+      - type: agent
+        include_instructions: true
+        include_tools: true

From 276fb769abe2d8832c16a0c3aafb9610adda896c Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Wed, 27 May 2026 08:48:15 -0700
Subject: [PATCH 07/16] Python: fix(evals): address PR review feedback

Addresses 4 Copilot review comments on PR #6101:

1. assert_dimension_score_at_least: drop the (not evaluator or found_any) guard so require_applicable=True correctly raises when the named evaluator produces no entries for the dimension. Adds TestRubricAssertions covering the regression.

2. GeneratedEvaluatorRef docstring: reword to describe actual behaviour (pinning recommended, not required) so it matches the dataclass default and FoundryEvals warning path.

3. _poll_generation_job: switch from asyncio.get_event_loop() to get_running_loop() and bound the per-iteration sleep by remaining time, matching _poll_eval_run.

4. generate_rubric: type category as Literal['quality','safety'] and validate at the entry point with a ValueError; drop the silent 'invalid -> quality' rewrite in _generation_job_to_ref. Adds a regression test.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../core/agent_framework/_evaluation.py       |   4 +-
 .../core/tests/core/test_local_eval.py        | 106 ++++++++++++++++++
 .../agent_framework_foundry/_foundry_evals.py |  28 +++--
 .../foundry/tests/test_foundry_evals.py       |  16 ++-
 4 files changed, 138 insertions(+), 16 deletions(-)

diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 48704d3543c..b14bdee9b22 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -577,7 +577,6 @@ def assert_dimension_score_at_least(
         def _check(results: EvalResults) -> None:
             for item in results.items:
                 found_applicable = False
-                found_any = False
                 for score in item.scores:
                     if evaluator is not None and score.name != evaluator:
                         continue
@@ -586,7 +585,6 @@ def _check(results: EvalResults) -> None:
                     for rs in score.dimensions:
                         if rs.id != dimension_id:
                             continue
-                        found_any = True
                         if not rs.applicable:
                             continue
                         found_applicable = True
@@ -595,7 +593,7 @@ def _check(results: EvalResults) -> None:
                                 f"{item.item_id}/{score.name}/{dimension_id}="
                                 f"{rs.score if rs.score is not None else 'None'}"
                             )
-                if require_applicable and not found_applicable and (not evaluator or found_any):
+                if require_applicable and not found_applicable:
                     missing_items.append(item.item_id)
             for sub in results.sub_results.values():
                 _check(sub)
diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py
index c13b107c4bd..e4c37dfb4b4 100644
--- a/python/packages/core/tests/core/test_local_eval.py
+++ b/python/packages/core/tests/core/test_local_eval.py
@@ -12,8 +12,13 @@
 from agent_framework._evaluation import (
     CheckResult,
     EvalItem,
+    EvalItemResult,
+    EvalNotPassedError,
+    EvalResults,
+    EvalScoreResult,
     ExpectedToolCall,
     LocalEvaluator,
+    RubricScore,
     _coerce_result,
     evaluator,
     keyword_check,
@@ -1010,6 +1015,107 @@ def test_all_passed_parent_fails_when_own_counts_fail(self):
         assert parent.all_passed is False
 
 
+# ---------------------------------------------------------------------------
+# Rubric assertions (EvalResults.assert_*)
+# ---------------------------------------------------------------------------
+
+
+def _rubric_results(*scores_per_item: list[EvalScoreResult]) -> EvalResults:
+    items = [
+        EvalItemResult(item_id=f"item-{i}", status="pass", scores=scores) for i, scores in enumerate(scores_per_item)
+    ]
+    return EvalResults(
+        provider="test",
+        eval_id="ev1",
+        run_id="run1",
+        result_counts={"passed": len(items), "failed": 0, "errored": 0, "total": len(items)},
+        items=items,
+    )
+
+
+class TestRubricAssertions:
+    """Tests for EvalResults.assert_dimension_score_at_least."""
+
+    def test_dimension_at_or_above_threshold_passes(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        # Should not raise.
+        results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_dimension_below_threshold_raises(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=0.5,
+                    dimensions=[RubricScore(id="clarity", score=2, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        with pytest.raises(EvalNotPassedError):
+            results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_non_applicable_skipped_by_default(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=1.0,
+                    dimensions=[RubricScore(id="clarity", score=None, applicable=False, weight=1, reason="n/a")],
+                )
+            ],
+        )
+        # No applicable scores; default behaviour is to skip silently.
+        results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_require_applicable_raises_when_dimension_absent(self) -> None:
+        results = _rubric_results(
+            [EvalScoreResult(name="policy", score=1.0, dimensions=[])],
+        )
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("clarity", 3, require_applicable=True)
+
+    def test_require_applicable_raises_when_filtered_evaluator_missing(self) -> None:
+        # Regression: previously the (not evaluator or found_any) guard caused
+        # this case to silently pass even with require_applicable=True.
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="other",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("clarity", 3, evaluator="policy", require_applicable=True)
+
+    def test_evaluator_filter_isolates_offenders(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="other",
+                    score=0.1,
+                    dimensions=[RubricScore(id="clarity", score=1, applicable=True, weight=1, reason="")],
+                ),
+                EvalScoreResult(
+                    name="policy",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                ),
+            ],
+        )
+        # The low-scoring "other" evaluator is filtered out; "policy" passes.
+        results.assert_dimension_score_at_least("clarity", 3, evaluator="policy")
+
+
 # ---------------------------------------------------------------------------
 # Eval source rendering (string dossiers)
 # ---------------------------------------------------------------------------
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index 0d83d8b1bc3..5241c4d268f 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -96,9 +96,11 @@ class GeneratedEvaluatorRef:
     when the evaluator already exists, or obtain one from
     :meth:`FoundryEvals.generate_rubric`.
 
-    By default ``version`` is required and pinned so an evaluation run is
-    reproducible.  Use :meth:`latest` to opt in to versionless references
-    explicitly.
+    Pinning ``version`` is strongly recommended so evaluation runs are
+    reproducible.  The dataclass accepts ``version=None`` for the
+    convenience of :meth:`latest`, but ``FoundryEvals`` emits a warning
+    whenever a versionless reference is used; CI gates should always
+    pass a concrete version.
 
     Attributes:
         name: Evaluator name as stored in the Foundry project (e.g.
@@ -1147,7 +1149,7 @@ async def generate_rubric(
         agent: BaseAgent | None = None,
         workflow: Workflow | None = None,
         sources: Sequence[EvalGenerationSource] | None = None,
-        category: str = "quality",
+        category: Literal["quality", "safety"] = "quality",
         model: str | None = None,
         display_name: str | None = None,
         description: str | None = None,
@@ -1209,6 +1211,9 @@ async def generate_rubric(
         """
         resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources)
 
+        if category not in ("quality", "safety"):
+            raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.")
+
         try:
             sdk_types = _import_generation_sdk_types()
         except _RubricSdkUnavailableError as exc:
@@ -1389,7 +1394,8 @@ async def _poll_generation_job(
     if not job_id:
         raise RuntimeError("Rubric generation job did not return an id.")
 
-    deadline = asyncio.get_event_loop().time() + timeout
+    loop = asyncio.get_running_loop()
+    deadline = loop.time() + timeout
     current = job
     while True:
         status = (getattr(current, "status", "") or "").lower()
@@ -1399,15 +1405,16 @@ async def _poll_generation_job(
                 err_msg = getattr(err, "message", None) or str(err) if err is not None else status
                 raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {err_msg}")
             return current
-        if asyncio.get_event_loop().time() >= deadline:
+        remaining = deadline - loop.time()
+        if remaining <= 0:
             raise TimeoutError(
                 f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})."
             )
-        await asyncio.sleep(poll_interval)
+        await asyncio.sleep(min(poll_interval, remaining))
         current = await evaluators_ops.get_generation_job(job_id)
 
 
-def _generation_job_to_ref(job: Any, *, category: str) -> GeneratedEvaluatorRef:
+def _generation_job_to_ref(job: Any, *, category: Literal["quality", "safety"]) -> GeneratedEvaluatorRef:
     """Build a pinned :class:`GeneratedEvaluatorRef` from a completed job."""
     artifacts: Any = getattr(job, "artifacts", None)
     evaluator: Any = getattr(artifacts, "evaluator", None) if artifacts is not None else None
@@ -1447,13 +1454,10 @@ def _generation_job_to_ref(job: Any, *, category: str) -> GeneratedEvaluatorRef:
         if isinstance(raw_threshold, (int, float)):
             pass_threshold = float(raw_threshold)
 
-    valid_category: str
-    valid_category = category if category in ("quality", "safety") else "quality"
-
     return GeneratedEvaluatorRef(
         name=str(ev_name),
         version=str(ev_version),
-        category=cast("Any", valid_category),
+        category=category,
         display_name=getattr(evaluator, "display_name", None),
         description=getattr(evaluator, "description", None),
         dimensions=dimensions,
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index 16dc9d50ce7..aee819eee71 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -6,7 +6,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -3305,6 +3305,20 @@ def _raise() -> Any:
                 sources=[EvalGenerationSource(type="prompt", prompt="hi")],
             )
 
+    async def test_raises_value_error_on_invalid_category(self) -> None:
+        """category outside {quality, safety} should fail fast at the boundary."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+        project_client = MagicMock()
+
+        with pytest.raises(ValueError, match="category"):
+            await FoundryEvals.generate_rubric(
+                project_client=project_client,
+                name="my-eval",
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
+                category=cast("Any", "invalid"),
+            )
+
 
 class TestGenerateRubricE2E:
     """End-to-end happy path for generate_rubric with mocked SDK."""

From 9a2c96404aaa9b811a15d7bde8a07eb78e22bf55 Mon Sep 17 00:00:00 2001
From: Ben Thomas <25218250+alliscode@users.noreply.github.com>
Date: Wed, 27 May 2026 14:35:17 -0700
Subject: [PATCH 08/16] Python: feat(foundry-evals): hosted-agent-aware rubric
 generation

* Auto-detect hosted Foundry agents in agent_as_eval_source: when the
  agent's chat_client exposes a string agent_name (the convention used
  by RawFoundryAgentChatClient for PromptAgents/HostedAgents), emit a
  type='agent' EvalGenerationSource so the service fetches instructions
  and tools from the agent registry instead of relying on the local
  wrapper (which holds neither for hosted agents).
* Add hosted_agent_version kwarg and a new agent_version field on
  EvalGenerationSource so PromptAgent runs can pin to a specific hosted
  version for reproducible rubric generation.
* Add force_prompt_source escape hatch to bypass auto-detection and
  always emit a rendered prompt dossier - useful when the local wrapper
  carries overrides the service-side agent doesn't see.
* Fix _to_sdk_source for dataset sources: SDK ctor takes name=/version=,
  not dataset_name=/dataset_version=. The mismatch would raise TypeError
  against the real azure-ai-projects 2.3.0a* SDK; only unmocked
  integration paths were affected.

Tests cover: auto-detection happy path, versionless hosted agent,
explicit hosted_agent_version forwarding, force_prompt_source override,
non-string chat_client attrs (MagicMock test doubles) not mis-detected,
agent_version forwarded through _to_sdk_source, and the corrected
dataset SDK kwarg names.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/_foundry_evals.py | 106 ++++++++++++++----
 .../foundry/tests/test_foundry_evals.py       |  83 +++++++++++++-
 2 files changed, 168 insertions(+), 21 deletions(-)

diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index 5241c4d268f..2b8d7913e08 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -183,6 +183,9 @@ class EvalGenerationSource:
         description: Optional short description shown in Foundry UI.
         prompt: Rendered dossier for ``type="prompt"`` sources.
         agent_name: Hosted Foundry agent name for ``type="agent"`` sources.
+        agent_version: Optional pinned hosted-agent version for
+            ``type="agent"`` sources.  ``None`` resolves to the latest
+            version at generation time; pin for reproducible runs.
         dataset_name: Foundry dataset name for ``type="dataset"`` sources.
         dataset_version: Pinned dataset version (recommended for repro).
         metadata: Free-form metadata.  Used by ``type="traces"`` sources
@@ -194,6 +197,7 @@ class EvalGenerationSource:
     description: str | None = None
     prompt: str | None = None
     agent_name: str | None = None
+    agent_version: str | None = None
     dataset_name: str | None = None
     dataset_version: str | None = None
     metadata: dict[str, Any] | None = None
@@ -209,41 +213,79 @@ def agent_as_eval_source(
     include_examples: bool = False,
     examples: Sequence[str] | None = None,
     hosted_agent_name: str | None = None,
+    hosted_agent_version: str | None = None,
+    force_prompt_source: bool = False,
 ) -> EvalGenerationSource:
     """Render an agent as an :class:`EvalGenerationSource` for rubric generation.
 
-    Wraps :meth:`BaseAgent.as_eval_source` to package the agent's
-    rendered dossier into a typed Foundry generation source.  When
-    ``hosted_agent_name`` is provided, returns a ``type="agent"`` source
-    referencing the hosted Foundry agent so the service fetches
-    server-side metadata directly instead of using a rendered dossier.
+    Picks the best Foundry source variant for the supplied agent:
+
+    * **Hosted Foundry agents** (``FoundryAgent`` connected to a Prompt
+      Agent or Hosted Agent in a Foundry project) are emitted as
+      ``type="agent"`` sources keyed by ``agent_name`` so the service
+      fetches instructions, tools, and metadata directly from the agent
+      registry — independent of whatever the local wrapper happens to
+      hold.  Detected automatically from ``agent.chat_client.agent_name``
+      and ``agent.chat_client.agent_version``.
+    * **Local agents** (any other ``BaseAgent`` whose instructions and
+      tools live client-side, e.g. ``FoundryChatClient``-backed agents or
+      pure OpenAI Responses agents) are emitted as ``type="prompt"``
+      sources with a rendered text dossier.
+
+    Override the heuristic by passing ``hosted_agent_name`` explicitly
+    (forces an ``"agent"`` source) or ``force_prompt_source=True``
+    (forces a ``"prompt"`` source — useful when you want the service to
+    score a hosted agent against the *local* wrapper's overrides).
 
     Args:
         agent: Agent instance (typically a ``BaseAgent`` subclass).
         include_instructions: Whether to include the agent's instructions
-            text.  Defaults to ``True``.
-        include_tools: Whether to include tool definitions.  Defaults to
+            text in the dossier (``"prompt"`` sources only).  Defaults to
             ``True``.
+        include_tools: Whether to include tool definitions in the dossier
+            (``"prompt"`` sources only).  Defaults to ``True``.
         include_context_providers: Whether to include the names of
-            attached context-provider classes.  Defaults to ``False`` to
-            avoid leaking implementation details.
-        include_examples: Whether to include the supplied ``examples``.
-            Defaults to ``False`` to avoid shipping potentially sensitive
-            sample inputs by default.
+            attached context-provider classes in the dossier
+            (``"prompt"`` sources only).  Defaults to ``False`` to avoid
+            leaking implementation details.
+        include_examples: Whether to include the supplied ``examples`` in
+            the dossier (``"prompt"`` sources only).  Defaults to
+            ``False`` to avoid shipping potentially sensitive sample
+            inputs by default.
         examples: Optional sample queries / interactions to include when
             ``include_examples`` is ``True``.
         hosted_agent_name: When set, emit a ``type="agent"`` source
-            referencing the hosted Foundry agent by name instead of a
-            rendered dossier.
+            referencing this hosted Foundry agent name regardless of
+            auto-detection.  Use to override or supplement the
+            heuristic.
+        hosted_agent_version: When set together with a hosted-agent
+            source, pins the source to a specific hosted-agent version.
+            Recommended for reproducible rubric generation against
+            PromptAgents.
+        force_prompt_source: When ``True``, always emit a
+            ``type="prompt"`` source with the rendered dossier even when
+            the agent is a hosted Foundry agent.  Useful when the local
+            wrapper holds overrides the service-side agent doesn't see.
 
     Returns:
         An :class:`EvalGenerationSource` describing the agent.
     """
-    if hosted_agent_name:
-        agent_description = getattr(agent, "description", None)
+    agent_description = getattr(agent, "description", None)
+
+    resolved_name = hosted_agent_name
+    resolved_version = hosted_agent_version
+    if resolved_name is None and not force_prompt_source:
+        detected_name, detected_version = _detect_hosted_foundry_agent(agent)
+        if detected_name is not None:
+            resolved_name = detected_name
+            if resolved_version is None:
+                resolved_version = detected_version
+
+    if resolved_name is not None and not force_prompt_source:
         return EvalGenerationSource(
             type="agent",
-            agent_name=hosted_agent_name,
+            agent_name=resolved_name,
+            agent_version=resolved_version,
             description=agent_description,
         )
 
@@ -254,7 +296,6 @@ def agent_as_eval_source(
         include_examples=include_examples,
         examples=examples,
     )
-    agent_description = getattr(agent, "description", None)
     return EvalGenerationSource(
         type="prompt",
         prompt=prompt,
@@ -262,6 +303,28 @@ def agent_as_eval_source(
     )
 
 
+def _detect_hosted_foundry_agent(agent: BaseAgent) -> tuple[str | None, str | None]:
+    """Return ``(agent_name, agent_version)`` for hosted Foundry agents, else ``(None, None)``.
+
+    A hosted Foundry agent is one whose ``chat_client`` exposes a string
+    ``agent_name`` — the convention used by ``RawFoundryAgentChatClient``
+    when ``FoundryAgent`` connects to an existing Prompt Agent or Hosted
+    Agent in a Foundry project.  Only string values are accepted so
+    test doubles using ``MagicMock`` for ``chat_client`` are not
+    mis-detected.
+    """
+    chat_client = getattr(agent, "chat_client", None)
+    if chat_client is None:
+        return None, None
+    name = getattr(chat_client, "agent_name", None)
+    version = getattr(chat_client, "agent_version", None)
+    if not isinstance(name, str) or not name:
+        return None, None
+    if not isinstance(version, str) or not version:
+        version = None
+    return name, version
+
+
 @experimental(feature_id=ExperimentalFeature.EVALS)
 def workflow_as_eval_source(
     workflow: Workflow,
@@ -1354,6 +1417,8 @@ def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes)
         if not source.agent_name:
             raise ValueError("EvalGenerationSource(type='agent') requires agent_name.")
         kwargs = {"agent_name": source.agent_name}
+        if source.agent_version is not None:
+            kwargs["agent_version"] = source.agent_version
         if source.description is not None:
             kwargs["description"] = source.description
         return sdk_types.AgentSource(**kwargs)
@@ -1364,9 +1429,10 @@ def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes)
             )
         if not source.dataset_name:
             raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.")
-        kwargs = {"dataset_name": source.dataset_name}
+        # SDK uses ``name`` / ``version`` (not ``dataset_name`` / ``dataset_version``).
+        kwargs = {"name": source.dataset_name}
         if source.dataset_version is not None:
-            kwargs["dataset_version"] = source.dataset_version
+            kwargs["version"] = source.dataset_version
         if source.description is not None:
             kwargs["description"] = source.description
         return sdk_types.DatasetSource(**kwargs)
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index aee819eee71..7244347e05b 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -3020,6 +3020,75 @@ def test_hosted_agent_name_emits_agent_source(self) -> None:
         assert source.prompt is None
         assert source.description == "Looks up the weather."
 
+    def test_explicit_hosted_agent_version_forwarded(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot")
+        source = agent_as_eval_source(
+            agent,
+            hosted_agent_name="weather-bot-hosted-id",
+            hosted_agent_version="3",
+        )
+        assert source.type == "agent"
+        assert source.agent_name == "weather-bot-hosted-id"
+        assert source.agent_version == "3"
+
+    def test_auto_detects_hosted_foundry_agent(self) -> None:
+        """A chat_client carrying agent_name/agent_version is treated as a hosted agent."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-prompt-agent"
+        agent.chat_client.agent_version = "2"
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "agent"
+        assert source.agent_name == "weather-prompt-agent"
+        assert source.agent_version == "2"
+        assert source.prompt is None
+        assert source.description == "Looks up the weather."
+
+    def test_auto_detection_handles_versionless_hosted_agent(self) -> None:
+        """HostedAgents typically omit agent_version (no None forwarded)."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-hosted-agent"
+        agent.chat_client.agent_version = None
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "agent"
+        assert source.agent_name == "weather-hosted-agent"
+        assert source.agent_version is None
+
+    def test_force_prompt_source_overrides_auto_detection(self) -> None:
+        """force_prompt_source=True falls back to dossier even for hosted agents."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-prompt-agent"
+        agent.chat_client.agent_version = "2"
+
+        source = agent_as_eval_source(agent, force_prompt_source=True)
+        assert source.type == "prompt"
+        assert source.prompt is not None
+        assert "Agent name: weather-bot" in source.prompt
+
+    def test_auto_detection_ignores_non_string_chat_client_fields(self) -> None:
+        """Bare MagicMock chat_client (untyped attrs) must not trigger detection."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="local-agent")
+        agent.chat_client = MagicMock()  # agent_name attr resolves to a MagicMock, not a str
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "prompt"
+        assert source.prompt is not None
+        assert "Agent name: local-agent" in source.prompt
+
     def test_forwards_keyword_options_to_agent(self) -> None:
         from agent_framework_foundry._foundry_evals import agent_as_eval_source
 
@@ -3192,7 +3261,19 @@ def test_dataset_source_is_translated(self) -> None:
             sdk,
         )
         assert out == "dataset-sdk-instance"
-        sdk.DatasetSource.assert_called_once_with(dataset_name="ds", dataset_version="1")
+        sdk.DatasetSource.assert_called_once_with(name="ds", version="1")
+
+    def test_agent_source_forwards_agent_version(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.AgentSource.return_value = "agent-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="agent", agent_name="prompt-agent", agent_version="2"),
+            sdk,
+        )
+        assert out == "agent-sdk-instance"
+        sdk.AgentSource.assert_called_once_with(agent_name="prompt-agent", agent_version="2")
 
 
 class TestPollGenerationJob:

From 31f81078243de9eb79584c037dec881da3d36204 Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Thu, 28 May 2026 09:12:57 -0700
Subject: [PATCH 09/16] fix(foundry-evals): accept canonical dimension_scores
 key per docs

The published Foundry rubric-evaluator output (Microsoft Learn 'Rubric evaluators' reference) places per-dimension breakdowns under properties.dimension_scores, not properties.rubric_scores. The parser now tries dimension_scores first and falls back to rubric_scores for preview-build compatibility, and tolerates non-list payloads (e.g. MagicMock auto-attrs) by trying the next candidate when parsing yields zero entries.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/_foundry_evals.py | 99 +++++++++++++------
 .../foundry/tests/test_foundry_evals.py       | 50 ++++++++++
 2 files changed, 117 insertions(+), 32 deletions(-)

diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index 2b8d7913e08..ae9de97d6cf 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -28,7 +28,7 @@
 
 import asyncio
 import logging
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Literal, cast
 
@@ -723,42 +723,31 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int]
     return per_eval
 
 
-def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
-    """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.
+_RUBRIC_DIMENSION_KEYS: tuple[str, ...] = ("dimension_scores", "rubric_scores")
+"""Property keys that may carry per-dimension rubric breakdowns.
 
-    Foundry rubric evaluators include a per-dimension breakdown under
-    ``properties.rubric_scores`` on each result.  The exact location may
-    vary across SDK versions, so this helper accepts a few shapes:
+The published Foundry rubric-evaluator output format uses
+``properties.dimension_scores`` (see the Microsoft Learn "Rubric
+evaluators" reference).  Earlier preview builds and some SDK shapes
+used ``rubric_scores``; we accept both for defensive forward/backward
+compatibility.
+"""
 
-    * The SDK ``sample`` object exposes ``properties.rubric_scores``.
-    * The ``sample`` is a dict containing ``properties.rubric_scores``.
-    * The ``sample`` is a dict with ``rubric_scores`` at the top level.
 
-    Returns ``None`` when no rubric scores are present (i.e. the
-    evaluator was not a rubric evaluator).
-    """
-    if sample is None:
-        return None
-
-    raw: Any = None
-    properties: Any = getattr(sample, "properties", None)
-    if properties is not None:
-        raw = getattr(properties, "rubric_scores", None)
-        if raw is None and isinstance(properties, dict):
-            raw = cast("dict[str, Any]", properties).get("rubric_scores")
-    if raw is None and isinstance(sample, dict):
-        sample_any = cast("dict[str, Any]", sample)
-        props_dict: Any = sample_any.get("properties")
-        if isinstance(props_dict, dict):
-            raw = cast("dict[str, Any]", props_dict).get("rubric_scores")
-        if raw is None:
-            raw = sample_any.get("rubric_scores")
+def _parse_dimension_entries(raw: Any) -> list[RubricScore]:
+    """Parse a raw list-like payload into ``RubricScore`` instances.
 
+    Returns an empty list when ``raw`` is falsy, not iterable, or
+    contains no well-formed entries.
+    """
     if not raw:
-        return None
+        return []
+    try:
+        raw_iter: Iterable[Any] = iter(raw)
+    except TypeError:
+        return []
 
     parsed: list[RubricScore] = []
-    raw_iter: Any = raw
     for raw_entry in raw_iter:
         entry: Any = raw_entry
         try:
@@ -792,8 +781,54 @@ def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
                 )
             )
         except (TypeError, ValueError):
-            logger.debug("Skipping malformed rubric_scores entry: %s", cast("Any", entry), exc_info=True)
-    return parsed or None
+            logger.debug("Skipping malformed rubric dimension entry: %s", cast("Any", entry), exc_info=True)
+    return parsed
+
+
+def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
+    """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.
+
+    Foundry rubric evaluators include a per-dimension breakdown under
+    ``properties.dimension_scores`` on each result (preview builds used
+    ``rubric_scores``; both keys are accepted, with the canonical
+    ``dimension_scores`` taking priority).  The exact location may
+    vary across SDK versions, so this helper accepts a few shapes:
+
+    * The SDK ``sample`` object exposes
+      ``properties.dimension_scores`` / ``properties.rubric_scores``.
+    * The ``sample`` is a dict containing the same under
+      ``properties.<key>``.
+    * The ``sample`` is a dict with ``dimension_scores`` /
+      ``rubric_scores`` at the top level.
+
+    Returns ``None`` when no rubric scores are present (i.e. the
+    evaluator was not a rubric evaluator).
+    """
+    if sample is None:
+        return None
+
+    containers: list[Any] = []
+    properties: Any = getattr(sample, "properties", None)
+    if properties is not None:
+        containers.append(properties)
+    if isinstance(sample, dict):
+        sample_any = cast("dict[str, Any]", sample)
+        props_dict: Any = sample_any.get("properties")
+        if props_dict is not None and props_dict is not properties:
+            containers.append(props_dict)
+        containers.append(sample_any)
+
+    for container in containers:
+        for key in _RUBRIC_DIMENSION_KEYS:
+            raw: Any = None
+            if isinstance(container, dict):
+                raw = cast("dict[str, Any]", container).get(key)
+            elif hasattr(container, key):
+                raw = getattr(container, key, None)
+            parsed = _parse_dimension_entries(raw)
+            if parsed:
+                return parsed
+    return None
 
 
 async def _fetch_output_items(
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index 7244347e05b..bffb0c066a7 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -2605,6 +2605,56 @@ def test_skips_malformed_entries(self) -> None:
         assert len(result) == 1
         assert result[0].id == "good"
 
+    def test_canonical_dimension_scores_key_from_docs(self) -> None:
+        """Per the Microsoft Learn docs, runtime output uses ``properties.dimension_scores``."""
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+
+        sample = {
+            "properties": {
+                "dimension_scores": [
+                    {
+                        "id": "intent_recognition",
+                        "score": 5,
+                        "applicable": True,
+                        "weight": 9,
+                        "reason": "Identified correctly.",
+                    },
+                    {
+                        "id": "general_quality",
+                        "score": 4,
+                        "applicable": True,
+                        "weight": 5,
+                        "reason": "Strong overall.",
+                    },
+                ]
+            }
+        }
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert [r.id for r in result] == ["intent_recognition", "general_quality"]
+        assert [r.score for r in result] == [5, 4]
+        assert [r.weight for r in result] == [9, 5]
+
+    def test_dimension_scores_via_attribute(self) -> None:
+        """Canonical key also resolves when properties exposes ``dimension_scores`` as an attr."""
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+
+        rs = MagicMock()
+        rs.id = "policy_enforcement"
+        rs.score = 1
+        rs.applicable = True
+        rs.weight = 5
+        rs.reason = "violated"
+
+        sample = MagicMock()
+        sample.properties = MagicMock(spec=["dimension_scores"])
+        sample.properties.dimension_scores = [rs]
+
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "policy_enforcement"
+        assert result[0].score == 1
+
 
 # ---------------------------------------------------------------------------
 # _poll_eval_run — timeout / failed / canceled paths

From f76343059d2e8648d672ed92a6b475e4524b9e87 Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Thu, 28 May 2026 09:23:28 -0700
Subject: [PATCH 10/16] feat(foundry-evals): add manual create_rubric_evaluator

Adds FoundryEvals.create_rubric_evaluator as the agent-framework surface over project_client.beta.evaluators.create_version. This is the manual counterpart to generate_rubric: callers supply RubricDimension instances (authored locally, ported from another framework, or hand-tuned) and we POST a RubricBasedEvaluatorDefinition. The service auto-attaches the non-editable residual dimension (general_quality for quality, general_policy_compliance for safety).

Per the Microsoft Learn 'Rubric evaluators' reference, the auto-generation path (create_generation_job) is primarily a portal/UI feature; external SDK clients with rich local agent context are better served by manual create_version. This keeps generate_rubric for users who want to round-trip through a Foundry-registered agent.

Validation up front: weight must be in [1,10], ids unique, descriptions non-empty, pass_threshold in [0,1]. The returned GeneratedEvaluatorRef is identical in shape to one obtained from generate_rubric, so downstream evaluators= lists work unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/_foundry_evals.py | 227 +++++++++++++++++
 .../foundry/tests/test_foundry_evals.py       | 241 ++++++++++++++++++
 2 files changed, 468 insertions(+)

diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index ae9de97d6cf..7fddd64c38c 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -1349,6 +1349,104 @@ async def generate_rubric(
 
         return _generation_job_to_ref(completed, category=category)
 
+    @classmethod
+    @experimental(feature_id=ExperimentalFeature.EVALS)
+    async def create_rubric_evaluator(
+        cls,
+        *,
+        project_client: AIProjectClient,
+        name: str,
+        dimensions: Sequence[RubricDimension],
+        category: Literal["quality", "safety"] = "quality",
+        pass_threshold: float | None = None,
+        display_name: str | None = None,
+        description: str | None = None,
+        tags: dict[str, str] | None = None,
+        metadata: dict[str, str] | None = None,
+    ) -> GeneratedEvaluatorRef:
+        """Register a rubric evaluator from caller-supplied dimensions.
+
+        This is the *manual* counterpart to :meth:`generate_rubric` and
+        maps directly to ``project_client.beta.evaluators.create_version``.
+        Use it to bring a rubric you authored elsewhere (e.g. authored
+        from an agent's local context, ported from another framework, or
+        hand-tuned) into Foundry as a versioned ``EvaluatorVersion``
+        that any subsequent ``evaluators=`` list can reference via the
+        returned :class:`GeneratedEvaluatorRef`.
+
+        The service auto-attaches a non-editable residual dimension
+        (``general_quality`` for ``category="quality"``,
+        ``general_policy_compliance`` for ``"safety"``) — do not include
+        it in ``dimensions``.
+
+        Keyword Args:
+            project_client: Async ``AIProjectClient`` for the target
+                Foundry project.
+            name: Stable evaluator name (e.g.
+                ``"reservation-agent-policy-v1"``). A new version is
+                allocated on each call.
+            dimensions: One or more :class:`RubricDimension` instances
+                describing the scoring blueprint. Each dimension's
+                ``id`` must be unique; ``weight`` must be in ``[1, 10]``.
+            category: ``"quality"`` (default) or ``"safety"``.
+            pass_threshold: Optional aggregate pass threshold on the
+                normalized 0.0-1.0 scale. Defaults to the service-side
+                default of ``0.5`` when omitted.
+            display_name: Optional human-readable name shown in the
+                Foundry portal.
+            description: Optional asset description.
+            tags: Optional asset tags.
+            metadata: Optional free-form metadata persisted with the
+                evaluator definition.
+
+        Returns:
+            A pinned :class:`GeneratedEvaluatorRef` referring to the
+            newly created evaluator version.
+
+        Raises:
+            ValueError: If ``category`` or ``pass_threshold`` is invalid, ``dimensions`` is empty,
+                or a dimension has an empty/duplicate id, empty description, or weight outside ``[1, 10]``.
+            NotImplementedError: If the installed ``azure-ai-projects``
+                version does not expose the manual rubric APIs.
+        """
+        if category not in ("quality", "safety"):
+            raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.")
+        if pass_threshold is not None and not (0.0 <= pass_threshold <= 1.0):
+            raise ValueError(f"pass_threshold must be in [0.0, 1.0] when set (got {pass_threshold!r}).")
+        if not dimensions:
+            raise ValueError("create_rubric_evaluator requires at least one dimension.")
+
+        try:
+            sdk_types = _import_manual_rubric_sdk_types()
+        except _RubricSdkUnavailableError as exc:
+            raise NotImplementedError(str(exc)) from exc
+
+        sdk_dimensions = _to_sdk_dimensions(dimensions, sdk_types.Dimension)
+        definition_kwargs: dict[str, Any] = {"dimensions": sdk_dimensions}
+        if pass_threshold is not None:
+            definition_kwargs["pass_threshold"] = pass_threshold
+        definition = sdk_types.RubricBasedEvaluatorDefinition(**definition_kwargs)
+
+        version_kwargs: dict[str, Any] = {
+            "evaluator_type": "custom",
+            "categories": [category],
+            "definition": definition,
+        }
+        if display_name is not None:
+            version_kwargs["display_name"] = display_name
+        if description is not None:
+            version_kwargs["description"] = description
+        if tags is not None:
+            version_kwargs["tags"] = tags
+        if metadata is not None:
+            version_kwargs["metadata"] = metadata
+
+        evaluator_version = sdk_types.EvaluatorVersion(**version_kwargs)
+        evaluators_ops = _get_beta_evaluators(project_client)
+        created = await evaluators_ops.create_version(name, evaluator_version=evaluator_version)
+
+        return _evaluator_version_to_ref(created, fallback_name=name, category=category)
+
 
 _TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"})
 
@@ -1369,6 +1467,15 @@ class _GenerationSdkTypes:
     TracesSource: Any | None
 
 
+@dataclass(frozen=True)
+class _ManualRubricSdkTypes:
+    """Resolved SDK type handles for manual rubric-evaluator creation."""
+
+    EvaluatorVersion: Any
+    RubricBasedEvaluatorDefinition: Any
+    Dimension: Any
+
+
 _RUBRIC_SDK_MISSING_MSG = (
     "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs "
     "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). "
@@ -1378,6 +1485,16 @@ class _GenerationSdkTypes:
 )
 
 
+_MANUAL_RUBRIC_SDK_MISSING_MSG = (
+    "FoundryEvals.create_rubric_evaluator requires the manual rubric-evaluator "
+    "APIs from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev "
+    "feed). Install a build that exposes "
+    "`azure.ai.projects.models.RubricBasedEvaluatorDefinition`, "
+    "`azure.ai.projects.models.Dimension`, and "
+    "`AIProjectClient.beta.evaluators.create_version`."
+)
+
+
 def _import_generation_sdk_types() -> _GenerationSdkTypes:
     """Lazily resolve the rubric-generation SDK types from azure-ai-projects."""
     try:
@@ -1406,6 +1523,116 @@ def _import_generation_sdk_types() -> _GenerationSdkTypes:
     )
 
 
+def _import_manual_rubric_sdk_types() -> _ManualRubricSdkTypes:
+    """Lazily resolve the manual rubric-evaluator SDK types from azure-ai-projects."""
+    try:
+        from azure.ai.projects import models as _models  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG) from exc
+
+    models_mod: Any = _models
+    version_cls: Any = getattr(models_mod, "EvaluatorVersion", None)
+    definition_cls: Any = getattr(models_mod, "RubricBasedEvaluatorDefinition", None)
+    dimension_cls: Any = getattr(models_mod, "Dimension", None)
+    if version_cls is None or definition_cls is None or dimension_cls is None:
+        raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG)
+
+    return _ManualRubricSdkTypes(
+        EvaluatorVersion=version_cls,
+        RubricBasedEvaluatorDefinition=definition_cls,
+        Dimension=dimension_cls,
+    )
+
+
+def _to_sdk_dimensions(
+    dimensions: Sequence[RubricDimension],
+    dimension_cls: Any,
+) -> list[Any]:
+    """Validate ``RubricDimension`` instances and translate them to SDK ``Dimension`` models.
+
+    The framework ``id`` is passed to the SDK as ``dimension_id``.  Raises ``ValueError`` for an
+    empty input, an empty id or description, a duplicate id, or a weight that is not an ``int``
+    in ``[1, 10]`` (``bool`` is rejected explicitly because it is an ``int`` subclass).
+    """
+    if not dimensions:
+        raise ValueError("create_rubric_evaluator requires at least one dimension.")
+    seen: set[str] = set()
+    sdk_dims: list[Any] = []
+    for dim in dimensions:
+        if not dim.id:
+            raise ValueError("RubricDimension.id must be a non-empty string.")
+        if not dim.description:
+            raise ValueError(f"RubricDimension(id={dim.id!r}).description must be non-empty.")
+        if isinstance(dim.weight, bool) or not isinstance(dim.weight, int) or not (1 <= dim.weight <= 10):
+            raise ValueError(f"RubricDimension(id={dim.id!r}).weight must be an int in [1, 10] (got {dim.weight!r}).")
+        if dim.id in seen:
+            raise ValueError(f"Duplicate RubricDimension.id={dim.id!r}; ids must be unique within a rubric.")
+        seen.add(dim.id)
+        kwargs: dict[str, Any] = {
+            "dimension_id": dim.id,
+            "description": dim.description,
+            "weight": dim.weight,
+        }
+        if dim.always_applicable:
+            kwargs["always_applicable"] = True
+        sdk_dims.append(dimension_cls(**kwargs))
+    return sdk_dims
+
+
+def _evaluator_version_to_ref(
+    created: Any,
+    *,
+    fallback_name: str,
+    category: Literal["quality", "safety"],
+) -> GeneratedEvaluatorRef:
+    """Translate a persisted ``EvaluatorVersion`` to a :class:`GeneratedEvaluatorRef`.
+
+    Used by both the generation-job path and the manual ``create_version``
+    path so callers see a uniform pinned reference regardless of how the
+    evaluator was authored.
+    """
+    ev_name = getattr(created, "name", None) or fallback_name
+    ev_version = getattr(created, "version", None)
+    if ev_version is None:
+        raise RuntimeError("Created evaluator version is missing a version identifier.")
+
+    definition: Any = getattr(created, "definition", None)
+    dimensions: tuple[RubricDimension, ...] | None = None
+    raw_dims: Any = getattr(definition, "dimensions", None) if definition is not None else None
+    if raw_dims:
+        parsed: list[RubricDimension] = []
+        for entry in raw_dims:
+            dim_id = getattr(entry, "dimension_id", None) or getattr(entry, "id", None)
+            try:
+                parsed.append(
+                    RubricDimension(
+                        id=str(dim_id or ""),
+                        description=str(getattr(entry, "description", "") or ""),
+                        weight=int(getattr(entry, "weight", 0) or 0),
+                        always_applicable=bool(getattr(entry, "always_applicable", False)),
+                    )
+                )
+            except (TypeError, ValueError):
+                logger.debug("Skipping malformed dimension on persisted evaluator", exc_info=True)
+        if parsed:
+            dimensions = tuple(parsed)
+
+    pass_threshold: float | None = None
+    raw_threshold: Any = getattr(definition, "pass_threshold", None) if definition is not None else None
+    if isinstance(raw_threshold, (int, float)):
+        pass_threshold = float(raw_threshold)
+
+    return GeneratedEvaluatorRef(
+        name=str(ev_name),
+        version=str(ev_version),
+        category=category,
+        display_name=getattr(created, "display_name", None),
+        description=getattr(created, "description", None),
+        dimensions=dimensions,
+        pass_threshold=pass_threshold,
+    )
+
+
 def _get_beta_evaluators(project_client: AIProjectClient) -> Any:
     """Return the ``project_client.beta.evaluators`` operations group, or raise."""
     beta = getattr(project_client, "beta", None)
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index bffb0c066a7..d24c528a744 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -27,6 +27,7 @@
 
 from agent_framework_foundry._foundry_evals import (
     FoundryEvals,
+    RubricDimension,
     _build_item_schema,
     _build_testing_criteria,
     _extract_per_evaluator,
@@ -3530,3 +3531,243 @@ async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch)
 
         job_cls.assert_called_once_with(inputs="sdk-inputs")
         evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123")
+
+
+# ---------------------------------------------------------------------------
+# FoundryEvals.create_rubric_evaluator — manual rubric registration
+# ---------------------------------------------------------------------------
+
+
+class TestCreateRubricEvaluatorValidation:
+    """Argument validation for ``FoundryEvals.create_rubric_evaluator``."""
+
+    async def test_rejects_empty_dimensions(self) -> None:
+        with pytest.raises(ValueError, match="at least one dimension"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[],
+            )
+
+    async def test_rejects_invalid_category(self) -> None:
+        with pytest.raises(ValueError, match="category"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[RubricDimension(id="a", description="d", weight=5)],
+                category="bogus",  # type: ignore[arg-type]
+            )
+
+    async def test_rejects_out_of_range_pass_threshold(self) -> None:
+        with pytest.raises(ValueError, match="pass_threshold"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[RubricDimension(id="a", description="d", weight=5)],
+                pass_threshold=1.5,
+            )
+
+    async def test_rejects_duplicate_dimension_ids(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        sdk = fm._ManualRubricSdkTypes(
+            EvaluatorVersion=MagicMock(),
+            RubricBasedEvaluatorDefinition=MagicMock(),
+            Dimension=MagicMock(),
+        )
+        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
+        with pytest.raises(ValueError, match="Duplicate"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[
+                    RubricDimension(id="dup", description="d1", weight=5),
+                    RubricDimension(id="dup", description="d2", weight=3),
+                ],
+            )
+
+    async def test_rejects_weight_out_of_range(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        sdk = fm._ManualRubricSdkTypes(
+            EvaluatorVersion=MagicMock(),
+            RubricBasedEvaluatorDefinition=MagicMock(),
+            Dimension=MagicMock(),
+        )
+        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
+        with pytest.raises(ValueError, match="weight"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[RubricDimension(id="a", description="d", weight=0)],
+            )
+
+    async def test_rejects_empty_description(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        sdk = fm._ManualRubricSdkTypes(
+            EvaluatorVersion=MagicMock(),
+            RubricBasedEvaluatorDefinition=MagicMock(),
+            Dimension=MagicMock(),
+        )
+        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
+        with pytest.raises(ValueError, match="description"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[RubricDimension(id="a", description="", weight=5)],
+            )
+
+
+class TestCreateRubricEvaluatorSdkMissing:
+    async def test_raises_not_implemented_when_sdk_lacks_types(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        def _raise() -> Any:
+            raise fm._RubricSdkUnavailableError("nope")
+
+        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", _raise)
+        with pytest.raises(NotImplementedError, match="nope"):
+            await FoundryEvals.create_rubric_evaluator(
+                project_client=MagicMock(),
+                name="x",
+                dimensions=[RubricDimension(id="a", description="d", weight=5)],
+            )
+
+
+class TestCreateRubricEvaluatorE2E:
+    """End-to-end happy path for create_rubric_evaluator with mocked SDK."""
+
+    async def test_calls_create_version_with_rubric_definition(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        dimension_cls = MagicMock(name="Dimension", side_effect=lambda **kw: ("dim", kw))
+        definition_cls = MagicMock(name="RubricBasedEvaluatorDefinition", side_effect=lambda **kw: ("def", kw))
+        version_cls = MagicMock(name="EvaluatorVersion", side_effect=lambda **kw: ("ver", kw))
+
+        sdk = fm._ManualRubricSdkTypes(
+            EvaluatorVersion=version_cls,
+            RubricBasedEvaluatorDefinition=definition_cls,
+            Dimension=dimension_cls,
+        )
+        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
+
+        created_definition = MagicMock()
+        created_definition.dimensions = [
+            MagicMock(dimension_id="intent", description="d1", weight=9, always_applicable=False),
+            MagicMock(dimension_id="general_quality", description="g", weight=5, always_applicable=True),
+        ]
+        created_definition.pass_threshold = 0.7
+        created_version = MagicMock(
+            display_name="DN",
+            description="hand-authored",
+        )
+        created_version.name = "policy-eval"
+        created_version.version = "3"
+        created_version.definition = created_definition
+
+        evaluators_ops = MagicMock()
+        evaluators_ops.create_version = AsyncMock(return_value=created_version)
+        project_client = MagicMock()
+        project_client.beta = MagicMock(evaluators=evaluators_ops)
+
+        ref = await FoundryEvals.create_rubric_evaluator(
+            project_client=project_client,
+            name="policy-eval",
+            dimensions=[
+                RubricDimension(id="intent", description="d1", weight=9),
+                RubricDimension(id="general_quality", description="g", weight=5, always_applicable=True),
+            ],
+            category="quality",
+            pass_threshold=0.7,
+            display_name="DN",
+            description="hand-authored",
+            tags={"team": "agents"},
+            metadata={"source": "manual"},
+        )
+
+        # Returned ref carries the persisted (name, version) and snapshot of dimensions.
+        assert ref.name == "policy-eval"
+        assert ref.version == "3"
+        assert ref.category == "quality"
+        assert ref.pass_threshold == 0.7
+        assert ref.dimensions is not None
+        assert [d.id for d in ref.dimensions] == ["intent", "general_quality"]
+        assert ref.dimensions[1].always_applicable is True
+
+        # Dimension construction used dimension_id, included always_applicable only when True.
+        assert dimension_cls.call_count == 2
+        first_kwargs = dimension_cls.call_args_list[0].kwargs
+        assert first_kwargs == {"dimension_id": "intent", "description": "d1", "weight": 9}
+        second_kwargs = dimension_cls.call_args_list[1].kwargs
+        assert second_kwargs == {
+            "dimension_id": "general_quality",
+            "description": "g",
+            "weight": 5,
+            "always_applicable": True,
+        }
+
+        # Definition construction forwarded pass_threshold and the two sdk dimensions.
+        definition_cls.assert_called_once()
+        def_kwargs = definition_cls.call_args.kwargs
+        assert def_kwargs["pass_threshold"] == 0.7
+        assert def_kwargs["dimensions"] == [
+            ("dim", {"dimension_id": "intent", "description": "d1", "weight": 9}),
+            (
+                "dim",
+                {
+                    "dimension_id": "general_quality",
+                    "description": "g",
+                    "weight": 5,
+                    "always_applicable": True,
+                },
+            ),
+        ]
+
+        # EvaluatorVersion construction passed evaluator_type="custom", category list, and optionals.
+        version_cls.assert_called_once()
+        ver_kwargs = version_cls.call_args.kwargs
+        assert ver_kwargs["evaluator_type"] == "custom"
+        assert ver_kwargs["categories"] == ["quality"]
+        assert ver_kwargs["display_name"] == "DN"
+        assert ver_kwargs["description"] == "hand-authored"
+        assert ver_kwargs["tags"] == {"team": "agents"}
+        assert ver_kwargs["metadata"] == {"source": "manual"}
+
+        # SDK ops invoked with name + evaluator_version kwarg.
+        evaluators_ops.create_version.assert_awaited_once()
+        call = evaluators_ops.create_version.await_args
+        assert call.args == ("policy-eval",)
+        assert "evaluator_version" in call.kwargs
+
+    async def test_omits_pass_threshold_when_not_set(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        from agent_framework_foundry import _foundry_evals as fm
+
+        dimension_cls = MagicMock(side_effect=lambda **kw: kw)
+        definition_cls = MagicMock(side_effect=lambda **kw: kw)
+        version_cls = MagicMock(side_effect=lambda **kw: kw)
+
+        sdk = fm._ManualRubricSdkTypes(
+            EvaluatorVersion=version_cls,
+            RubricBasedEvaluatorDefinition=definition_cls,
+            Dimension=dimension_cls,
+        )
+        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
+
+        created = MagicMock(display_name=None, description=None)
+        created.name = "x"
+        created.version = "1"
+        created.definition = MagicMock(dimensions=[], pass_threshold=None)
+
+        evaluators_ops = MagicMock()
+        evaluators_ops.create_version = AsyncMock(return_value=created)
+        project_client = MagicMock()
+        project_client.beta = MagicMock(evaluators=evaluators_ops)
+
+        ref = await FoundryEvals.create_rubric_evaluator(
+            project_client=project_client,
+            name="x",
+            dimensions=[RubricDimension(id="a", description="d", weight=5)],
+        )
+        assert ref.pass_threshold is None
+        assert "pass_threshold" not in definition_cls.call_args.kwargs

From 484b98d44256583d48f4c47acea1fcfe13e81ce1 Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Thu, 28 May 2026 09:38:02 -0700
Subject: [PATCH 11/16] samples(foundry-evals): manual rubric sample +
 namespace re-exports

Adds evaluate_with_manual_rubric_sample.py demonstrating the end-to-end dev scenario for FoundryEvals.create_rubric_evaluator: hand-author a list of RubricDimension, register via create_rubric_evaluator, then use the pinned GeneratedEvaluatorRef alongside built-in evaluators in an agent regression run.

Also re-exports RubricDimension, GeneratedEvaluatorRef, build_sources, and load_evals_config from agent_framework.foundry (both the lazy runtime shim and the type stub) so the rubric samples can import everything from a single namespace; the auto-generate sample was previously broken because the shim was missing build_sources / load_evals_config.

Updates the foundry-evals README with a chooser entry for the two rubric paths.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../core/agent_framework/foundry/__init__.py  |   4 +
 .../core/agent_framework/foundry/__init__.pyi |   8 +
 .../evaluation/foundry_evals/README.md        |  22 +++
 .../evaluate_with_manual_rubric_sample.py     | 172 ++++++++++++++++++
 4 files changed, 206 insertions(+)
 create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py

diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py
index 82a476ddff8..2abd2b58e90 100644
--- a/python/packages/core/agent_framework/foundry/__init__.py
+++ b/python/packages/core/agent_framework/foundry/__init__.py
@@ -34,13 +34,17 @@
     "FoundryLocalChatOptions": ("agent_framework_foundry_local", "agent-framework-foundry-local"),
     "FoundryLocalClient": ("agent_framework_foundry_local", "agent-framework-foundry-local"),
     "FoundryLocalSettings": ("agent_framework_foundry_local", "agent-framework-foundry-local"),
+    "GeneratedEvaluatorRef": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawAnthropicFoundryClient": ("agent_framework_anthropic", "agent-framework-anthropic"),
     "RawFoundryAgent": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawFoundryChatClient": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawFoundryEmbeddingClient": ("agent_framework_foundry", "agent-framework-foundry"),
+    "RubricDimension": ("agent_framework_foundry", "agent-framework-foundry"),
+    "build_sources": ("agent_framework_foundry", "agent-framework-foundry"),
     "evaluate_foundry_target": ("agent_framework_foundry", "agent-framework-foundry"),
     "evaluate_traces": ("agent_framework_foundry", "agent-framework-foundry"),
+    "load_evals_config": ("agent_framework_foundry", "agent-framework-foundry"),
 }
 
 
diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi
index 7deb709c2a3..abcf45868f2 100644
--- a/python/packages/core/agent_framework/foundry/__init__.pyi
+++ b/python/packages/core/agent_framework/foundry/__init__.pyi
@@ -20,12 +20,16 @@ from agent_framework_foundry import (
     FoundryEmbeddingSettings,
     FoundryEvals,
     FoundryMemoryProvider,
+    GeneratedEvaluatorRef,
     RawFoundryAgent,
     RawFoundryAgentChatClient,
     RawFoundryChatClient,
     RawFoundryEmbeddingClient,
+    RubricDimension,
+    build_sources,
     evaluate_foundry_target,
     evaluate_traces,
+    load_evals_config,
 )
 from agent_framework_foundry_local import (
     FoundryLocalChatOptions,
@@ -51,11 +55,15 @@ __all__ = [
     "FoundryLocalClient",
     "FoundryLocalSettings",
     "FoundryMemoryProvider",
+    "GeneratedEvaluatorRef",
     "RawAnthropicFoundryClient",
     "RawFoundryAgent",
     "RawFoundryAgentChatClient",
     "RawFoundryChatClient",
     "RawFoundryEmbeddingClient",
+    "RubricDimension",
+    "build_sources",
     "evaluate_foundry_target",
     "evaluate_traces",
+    "load_evals_config",
 ]
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
index 81412a7f0ef..b7f8f7cc1b6 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
@@ -35,6 +35,26 @@ Evaluate what already happened — zero changes to agent code:
 uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py
 ```
 
+### `evaluate_with_generated_rubric_sample.py` — Auto-Generate a Rubric
+
+Let Foundry draft the rubric dimensions for you from the agent's
+context (instructions, tools, description).  Best when you don't yet
+have a fixed scoring rubric and want a strong baseline you can refine.
+
+```bash
+uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
+```
+
+### `evaluate_with_manual_rubric_sample.py` — Author a Rubric Yourself
+
+Bring your own `RubricDimension`s (from a spec, a competing framework,
+or hand tuning) and register them as a versioned evaluator.  Use this
+when you already know what you want to score.
+
+```bash
+uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
+```
+
 ## Setup
 
 Create a `.env` file with configuration as in the `.env.example` file in this folder.
@@ -44,3 +64,5 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo
 - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1
 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py`
 - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2
+- **"I want Foundry to draft a custom rubric for my agent"** → `evaluate_with_generated_rubric_sample.py`
+- **"I already have a rubric I want to bring into Foundry"** → `evaluate_with_manual_rubric_sample.py`
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
new file mode 100644
index 00000000000..e1fc86ef71c
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
@@ -0,0 +1,172 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Register a hand-authored rubric evaluator and use it in CI.
+
+This sample demonstrates the *manual* counterpart to
+``evaluate_with_generated_rubric_sample.py``:
+
+1. Build an agent.
+2. Author the rubric dimensions yourself — useful when you have an
+   established scoring rubric (from a spec, a competing framework, or
+   prior hand tuning) that you want to bring into Foundry as-is.
+3. Register the rubric with
+   :meth:`FoundryEvals.create_rubric_evaluator` — this maps directly to
+   ``project_client.beta.evaluators.create_version`` and returns a
+   pinned ``GeneratedEvaluatorRef`` you can store in source control.
+4. Use the pinned reference in ``evaluators=[...]`` for a regression run
+   alongside built-in evaluators.
+
+The service auto-attaches a non-editable residual dimension
+(``general_quality`` for ``category="quality"``,
+``general_policy_compliance`` for ``"safety"``) — do not include it in
+``dimensions``.
+
+Prefer :meth:`FoundryEvals.generate_rubric` if you want Foundry to
+draft the dimensions for you from the agent's context.  Use this manual
+flow when you already know what you want to score.
+
+Prerequisites:
+- An Azure AI Foundry project with a deployed model.
+- ``azure-ai-projects`` build that includes the rubric APIs (currently
+  ``2.3.0a*`` on the Azure SDK Python dev feed).
+- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``.
+
+Run with:
+
+.. code-block:: bash
+
+    az login
+    python evaluate_with_manual_rubric_sample.py
+"""
+
+import asyncio
+import os
+
+from agent_framework import evaluate_agent
+from agent_framework.foundry import (
+    FoundryChatClient,
+    FoundryEvals,
+    RubricDimension,
+)
+from azure.ai.projects.aio import AIProjectClient
+from azure.identity.aio import AzureCliCredential
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def get_weather(location: str) -> str:
+    """Get the current weather for a location."""
+    samples = {
+        "seattle": "62F, cloudy with a chance of rain",
+        "london": "55F, overcast",
+        "paris": "68F, partly sunny",
+    }
+    return samples.get(location.lower(), f"Weather data not available for {location}")
+
+
+# Hand-authored rubric — this is the artifact you commit alongside the
+# agent so the rubric and the behavior it scores evolve together.
+# Weights are 1-10 (the generation pipeline biases one dimension to
+# 8-10; manual edits aren't constrained by this heuristic).
+TRAVEL_RUBRIC_DIMENSIONS: list[RubricDimension] = [
+    RubricDimension(
+        id="tool_grounding",
+        description=(
+            "Grounds every weather claim in tool output.  Does not invent values when "
+            "the tool returns no data, and does not paraphrase tool output in a way "
+            "that distorts the underlying values."
+        ),
+        weight=9,
+    ),
+    RubricDimension(
+        id="scope_adherence",
+        description=(
+            "Stays within travel-planning scope.  Politely declines or redirects "
+            "questions about topics unrelated to travel (e.g. general trivia, "
+            "personal advice, coding questions)."
+        ),
+        weight=6,
+    ),
+    RubricDimension(
+        id="actionable_recommendation",
+        description=(
+            "Provides a clear, actionable recommendation grounded in the tool result "
+            "(e.g. 'Pack an umbrella' when rain is reported), not just a restatement "
+            "of the raw weather data."
+        ),
+        weight=4,
+    ),
+]
+
+
+async def main() -> None:
+    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
+    model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o")
+
+    credential = AzureCliCredential()
+    chat_client = FoundryChatClient(
+        project_endpoint=project_endpoint,
+        model=model_name,
+        credential=credential,
+    )
+    project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)
+
+    agent = chat_client.as_agent(
+        name="travel-assistant",
+        instructions=(
+            "You are a helpful travel assistant.  Always ground recommendations in "
+            "tool output, cite each tool result, and refuse questions outside travel "
+            "planning."
+        ),
+        tools=[get_weather],
+    )
+
+    # 1. Register (or bump the version of) the hand-authored rubric.
+    # The service auto-attaches the non-editable `general_quality`
+    # residual dimension for quality rubrics.
+    print("Registering manual rubric evaluator...")
+    rubric_ref = await FoundryEvals.create_rubric_evaluator(
+        project_client=project_client,
+        name="travel-quality-manual",
+        dimensions=TRAVEL_RUBRIC_DIMENSIONS,
+        category="quality",
+        pass_threshold=0.6,
+        display_name="Travel Quality (Manual)",
+        description="Hand-authored rubric for the travel-assistant agent.",
+    )
+    print(
+        f"Registered rubric {rubric_ref.name}@{rubric_ref.version} "
+        f"with {len(rubric_ref.dimensions or ())} dimensions "
+        f"(pass_threshold={rubric_ref.pass_threshold})"
+    )
+
+    # 2. Run an evaluation that combines built-ins with the new rubric.
+    evals = FoundryEvals(
+        client=chat_client,
+        evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref],
+    )
+    results = await evaluate_agent(
+        agent=agent,
+        queries=[
+            "What's the weather in Seattle?",
+            "Should I pack an umbrella for London?",
+            "What's the capital of France?",  # off-scope — exercises scope_adherence
+        ],
+        evaluators=evals,
+    )
+
+    # 3. Quality gates — wire these into your CI job's exit status.
+    for r in results:
+        print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}")
+        r.assert_no_failed_items()
+        r.assert_score_at_least(0.7)
+        r.assert_dimension_score_at_least("tool_grounding", 3)
+        r.assert_dimension_score_at_least("scope_adherence", 3)
+
+    await project_client.close()
+    await credential.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 972b55f70c73780a6fe4bc2c5cf4a8be1180bb98 Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Thu, 28 May 2026 10:01:37 -0700
Subject: [PATCH 12/16] feat(foundry-evals): remove rubric creation flows; keep
 consumption only

Reframes agent-framework as a pure consumer of Foundry rubric evaluators: scoring against rubrics that already exist (authored in the Foundry portal or via the dedicated SDK / REST surface) instead of creating them from the SDK.

Removed creation surface area:

- FoundryEvals.generate_rubric (auto-generate path) and create_rubric_evaluator (manual path), plus all _GenerationSdkTypes / _ManualRubricSdkTypes / _to_sdk_dimensions / _coalesce_generation_sources / _to_sdk_source / _poll_generation_job / _generation_job_to_ref / _evaluator_version_to_ref / _get_beta_evaluators / _import_*_sdk_types helpers.

- EvalGenerationSource (the input source discriminator), RubricDimension (the input dimension type), agent_as_eval_source / workflow_as_eval_source / _detect_hosted_foundry_agent helpers, and the YAML-config loader (_evals_config.py with RubricGenerationSpec / RubricSourceSpec / parse_evals_config / load_evals_config / build_sources).

- BaseAgent.as_eval_source / Workflow.as_eval_source plus the _render_agent_dossier / _render_workflow_dossier helpers in core. These existed only to feed the now-removed generation pipeline.

- Samples evaluate_with_generated_rubric_sample.py, evaluate_with_manual_rubric_sample.py, and evaluators.yaml. These are replaced by a short README section showing how to reference an existing rubric evaluator via GeneratedEvaluatorRef.

Kept (consumption surface):

- GeneratedEvaluatorRef, slimmed to (name, version, display_name). Still accepted alongside built-in evaluator strings in FoundryEvals(evaluators=[...]). Versionless refs still warn.

- RubricScore on EvalScoreResult.dimensions plus EvalResults.assert_dimension_score_at_least for per-dimension CI gates.

- _parse_dimension_entries / _extract_rubric_scores output parsing (both canonical dimension_scores and the legacy rubric_scores key).

Tests: 160/160 foundry unit tests and 71/71 core local-eval tests pass; pyright is clean across changed files. The pre-existing tests/core/test_telemetry.py::test_detect_hosted_fallback_import_error failure is unrelated and reproduces on the prior commit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../packages/core/agent_framework/_agents.py  |  43 -
 .../core/agent_framework/_evaluation.py       | 134 ---
 .../agent_framework/_workflows/_workflow.py   |  49 -
 .../core/agent_framework/foundry/__init__.py  |   3 -
 .../core/agent_framework/foundry/__init__.pyi |   6 -
 .../core/tests/core/test_local_eval.py        | 200 ----
 .../agent_framework_foundry/__init__.py       |  20 -
 .../agent_framework_foundry/_evals_config.py  | 403 --------
 .../agent_framework_foundry/_foundry_evals.py | 869 +-----------------
 .../foundry/tests/test_evals_config.py        | 273 ------
 .../foundry/tests/test_foundry_evals.py       | 755 +--------------
 .../evaluation/foundry_evals/README.md        |  43 +-
 .../evaluate_with_generated_rubric_sample.py  | 151 ---
 .../evaluate_with_manual_rubric_sample.py     | 172 ----
 .../evaluation/foundry_evals/evaluators.yaml  |  11 -
 15 files changed, 44 insertions(+), 3088 deletions(-)
 delete mode 100644 python/packages/foundry/agent_framework_foundry/_evals_config.py
 delete mode 100644 python/packages/foundry/tests/test_evals_config.py
 delete mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
 delete mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
 delete mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml

diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py
index 65506cadc6f..585898ae523 100644
--- a/python/packages/core/agent_framework/_agents.py
+++ b/python/packages/core/agent_framework/_agents.py
@@ -444,49 +444,6 @@ def get_session(self, service_session_id: str, *, session_id: str | None = None)
         """
         return AgentSession(session_id=session_id, service_session_id=service_session_id)
 
-    def as_eval_source(
-        self,
-        *,
-        include_instructions: bool = True,
-        include_tools: bool = True,
-        include_context_providers: bool = False,
-        include_examples: bool = False,
-        examples: Sequence[str] | None = None,
-    ) -> str:
-        """Render this agent as a textual dossier for rubric-evaluator generation.
-
-        Packages the agent's name, description, instructions, tool
-        definitions, and optional context-provider class names into a
-        single plain-text dossier suitable for passing to a rubric
-        generation pipeline (e.g. ``FoundryEvals.generate_rubric``).
-
-        Defaults are conservative: instructions and tools are included;
-        examples and context-provider class names are not.
-
-        Keyword Args:
-            include_instructions: Whether to include the agent's
-                instructions text.
-            include_tools: Whether to include tool definitions.
-            include_context_providers: Whether to include attached
-                context-provider class names.
-            include_examples: Whether to include the supplied ``examples``.
-            examples: Sample queries / interactions to include when
-                ``include_examples`` is true.
-
-        Returns:
-            A plain-text dossier describing the agent.
-        """
-        from ._evaluation import _render_agent_dossier  # pyright: ignore[reportPrivateUsage]
-
-        return _render_agent_dossier(
-            self,
-            include_instructions=include_instructions,
-            include_tools=include_tools,
-            include_context_providers=include_context_providers,
-            include_examples=include_examples,
-            examples=examples,
-        )
-
     async def _run_after_providers(
         self,
         *,
diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index b14bdee9b22..52bdf90d0fc 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -673,140 +673,6 @@ class RubricScore:
     reason: str
 
 
-# endregion
-
-# region Eval source rendering
-
-
-def _render_agent_dossier(
-    agent: Any,
-    *,
-    include_instructions: bool,
-    include_tools: bool,
-    include_context_providers: bool,
-    include_examples: bool,
-    examples: Sequence[str] | None,
-) -> str:
-    """Render a structured, plain-text dossier of an agent for rubric generation."""
-    lines: list[str] = []
-    name = getattr(agent, "name", None) or "<unnamed agent>"
-    description = getattr(agent, "description", None)
-    lines.append(f"Agent name: {name}")
-    if description:
-        lines.append(f"Description: {description}")
-
-    if include_instructions:
-        instructions: str | None = None
-        default_options: Any = getattr(agent, "default_options", None)
-        if isinstance(default_options, dict):
-            raw_instr: Any = cast("dict[str, Any]", default_options).get("instructions")
-            if isinstance(raw_instr, str) and raw_instr.strip():
-                instructions = raw_instr
-        if instructions is None:
-            raw_instr = getattr(agent, "instructions", None)
-            if isinstance(raw_instr, str) and raw_instr.strip():
-                instructions = raw_instr
-        if instructions:
-            lines.append("")
-            lines.append("Instructions:")
-            lines.append(instructions.strip())
-
-    if include_tools:
-        tool_defs = AgentEvalConverter.extract_tools(agent)
-        if tool_defs:
-            lines.append("")
-            lines.append("Tools:")
-            for tool in tool_defs:
-                tool_line = f"- {tool['name']}"
-                tool_desc = tool.get("description")
-                if tool_desc:
-                    tool_line += f": {tool_desc}"
-                lines.append(tool_line)
-                params = tool.get("parameters")
-                if params:
-                    try:
-                        params_json = json.dumps(params, sort_keys=True)
-                    except (TypeError, ValueError):
-                        params_json = str(params)
-                    lines.append(f"  parameters: {params_json}")
-
-    if include_context_providers:
-        providers = getattr(agent, "context_providers", None)
-        if providers:
-            lines.append("")
-            lines.append("Context providers:")
-            for provider in providers:
-                lines.append(f"- {type(provider).__name__}")
-
-    if include_examples and examples:
-        lines.append("")
-        lines.append("Examples:")
-        for idx, example in enumerate(examples, start=1):
-            lines.append(f"{idx}. {example}")
-
-    return "\n".join(lines).strip()
-
-
-def _render_workflow_dossier(  # pyright: ignore[reportUnusedFunction]
-    workflow: Workflow,
-    *,
-    include_instructions: bool,
-    include_tools: bool,
-    include_context_providers: bool,
-    include_examples: bool,
-    examples: Sequence[str] | None,
-    include_topology: bool,
-) -> str:
-    """Render a structured, plain-text dossier of a workflow for rubric generation."""
-    from ._workflows._agent_executor import AgentExecutor as _AE
-
-    lines: list[str] = []
-    name = workflow.name or "<unnamed workflow>"
-    lines.append(f"Workflow name: {name}")
-    if workflow.description:
-        lines.append(f"Description: {workflow.description}")
-
-    if include_topology:
-        try:
-            topology = json.dumps(workflow.to_dict(), sort_keys=True, default=str)
-        except (TypeError, ValueError) as exc:
-            logger.debug("Workflow.to_dict() failed during eval source export: %s", exc)
-            topology = None
-        if topology:
-            lines.append("")
-            lines.append("Topology (JSON):")
-            lines.append(topology)
-
-    agent_executors: list[tuple[str, Any]] = []
-    for executor_id, executor in workflow.executors.items():
-        if isinstance(executor, _AE):
-            agent_executors.append((executor_id, executor.agent))
-
-    if agent_executors:
-        lines.append("")
-        lines.append("Agents:")
-        for executor_id, agent in agent_executors:
-            lines.append("")
-            lines.append(f"Executor: {executor_id}")
-            dossier = _render_agent_dossier(
-                agent,
-                include_instructions=include_instructions,
-                include_tools=include_tools,
-                include_context_providers=include_context_providers,
-                include_examples=False,
-                examples=None,
-            )
-            lines.append(dossier)
-
-    if include_examples and examples:
-        lines.append("")
-        lines.append("Examples:")
-        for idx, example in enumerate(examples, start=1):
-            lines.append(f"{idx}. {example}")
-
-    return "\n".join(lines).strip()
-
-
 # endregion
 
 # region Evaluator protocol
diff --git a/python/packages/core/agent_framework/_workflows/_workflow.py b/python/packages/core/agent_framework/_workflows/_workflow.py
index bce7569ef1a..0493cd015f3 100644
--- a/python/packages/core/agent_framework/_workflows/_workflow.py
+++ b/python/packages/core/agent_framework/_workflows/_workflow.py
@@ -410,55 +410,6 @@ def to_json(self) -> str:
         """Serialize the workflow definition to JSON."""
         return json.dumps(self.to_dict())
 
-    def as_eval_source(
-        self,
-        *,
-        include_instructions: bool = True,
-        include_tools: bool = True,
-        include_context_providers: bool = False,
-        include_examples: bool = False,
-        examples: Sequence[str] | None = None,
-        include_topology: bool = True,
-    ) -> str:
-        """Render this workflow as a textual dossier for rubric-evaluator generation.
-
-        Produces a plain-text dossier containing the workflow's name,
-        description, optional JSON-encoded topology (from
-        :meth:`Workflow.to_dict`), and per-agent dossiers extracted from
-        ``AgentExecutor`` nodes.  Suitable for passing to a rubric
-        generation pipeline (e.g. ``FoundryEvals.generate_rubric``).
-
-        Defaults are conservative: per-agent instructions and tools are
-        included, plus the JSON-encoded topology.  Examples and
-        context-provider class names are excluded by default.
-
-        Keyword Args:
-            include_instructions: Per-agent instructions inclusion.
-            include_tools: Per-agent tool-definition inclusion.
-            include_context_providers: Per-agent context-provider
-                inclusion.
-            include_examples: Whether to include workflow-level
-                ``examples``.
-            examples: Sample queries / interactions to include when
-                ``include_examples`` is true.
-            include_topology: Whether to embed the JSON-encoded workflow
-                topology in the rendered dossier.
-
-        Returns:
-            A plain-text dossier describing the workflow.
-        """
-        from .._evaluation import _render_workflow_dossier  # pyright: ignore[reportPrivateUsage]
-
-        return _render_workflow_dossier(
-            self,
-            include_instructions=include_instructions,
-            include_tools=include_tools,
-            include_context_providers=include_context_providers,
-            include_examples=include_examples,
-            examples=examples,
-            include_topology=include_topology,
-        )
-
     def get_start_executor(self) -> Executor:
         """Get the starting executor of the workflow.
 
diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py
index 2abd2b58e90..4fe624cc169 100644
--- a/python/packages/core/agent_framework/foundry/__init__.py
+++ b/python/packages/core/agent_framework/foundry/__init__.py
@@ -40,11 +40,8 @@
     "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawFoundryChatClient": ("agent_framework_foundry", "agent-framework-foundry"),
     "RawFoundryEmbeddingClient": ("agent_framework_foundry", "agent-framework-foundry"),
-    "RubricDimension": ("agent_framework_foundry", "agent-framework-foundry"),
-    "build_sources": ("agent_framework_foundry", "agent-framework-foundry"),
     "evaluate_foundry_target": ("agent_framework_foundry", "agent-framework-foundry"),
     "evaluate_traces": ("agent_framework_foundry", "agent-framework-foundry"),
-    "load_evals_config": ("agent_framework_foundry", "agent-framework-foundry"),
 }
 
 
diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi
index abcf45868f2..145ea48087d 100644
--- a/python/packages/core/agent_framework/foundry/__init__.pyi
+++ b/python/packages/core/agent_framework/foundry/__init__.pyi
@@ -25,11 +25,8 @@ from agent_framework_foundry import (
     RawFoundryAgentChatClient,
     RawFoundryChatClient,
     RawFoundryEmbeddingClient,
-    RubricDimension,
-    build_sources,
     evaluate_foundry_target,
     evaluate_traces,
-    load_evals_config,
 )
 from agent_framework_foundry_local import (
     FoundryLocalChatOptions,
@@ -61,9 +58,6 @@ __all__ = [
     "RawFoundryAgentChatClient",
     "RawFoundryChatClient",
     "RawFoundryEmbeddingClient",
-    "RubricDimension",
-    "build_sources",
     "evaluate_foundry_target",
     "evaluate_traces",
-    "load_evals_config",
 ]
diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py
index e4c37dfb4b4..e60fb35d514 100644
--- a/python/packages/core/tests/core/test_local_eval.py
+++ b/python/packages/core/tests/core/test_local_eval.py
@@ -5,7 +5,6 @@
 from __future__ import annotations
 
 import inspect
-from typing import Any
 
 import pytest
 
@@ -1114,202 +1113,3 @@ def test_evaluator_filter_isolates_offenders(self) -> None:
         )
         # The low-scoring "other" evaluator is filtered out; "policy" passes.
         results.assert_dimension_score_at_least("clarity", 3, evaluator="policy")
-
-
-# ---------------------------------------------------------------------------
-# Eval source rendering (string dossiers)
-# ---------------------------------------------------------------------------
-
-
-class TestAgentAsEvalSource:
-    """Tests for BaseAgent.as_eval_source / _render_agent_dossier."""
-
-    def _make_mock_agent(
-        self,
-        *,
-        name: str = "weather-bot",
-        description: str | None = "Looks up the weather.",
-        instructions: str | None = "Be concise.  Always cite the source.",
-        tools: list[Any] | None = None,
-        context_providers: list[Any] | None = None,
-        mcp_tools: list[Any] | None = None,
-    ) -> Any:
-        from unittest.mock import MagicMock
-
-        from agent_framework._tools import ai_function
-
-        agent = MagicMock()
-        agent.name = name
-        agent.description = description
-        agent.default_options = {"instructions": instructions, "tools": tools or []}
-        agent.context_providers = context_providers or []
-        agent.mcp_tools = mcp_tools or []
-        if tools:
-            normalized: list[Any] = []
-            for t in tools:
-                if callable(t) and not hasattr(t, "parameters"):
-                    normalized.append(ai_function(t))
-                else:
-                    normalized.append(t)
-            agent.default_options["tools"] = normalized
-        return agent
-
-    def _render(self, agent: Any, **overrides: Any) -> str:
-        from agent_framework._evaluation import _render_agent_dossier
-
-        kwargs: dict[str, Any] = {
-            "include_instructions": True,
-            "include_tools": True,
-            "include_context_providers": False,
-            "include_examples": False,
-            "examples": None,
-        }
-        kwargs.update(overrides)
-        return _render_agent_dossier(agent, **kwargs)
-
-    def test_basic_dossier_includes_name_and_instructions(self):
-        agent = self._make_mock_agent()
-        dossier = self._render(agent)
-        assert isinstance(dossier, str)
-        assert "Agent name: weather-bot" in dossier
-        assert "Description: Looks up the weather." in dossier
-        assert "Instructions:" in dossier
-        assert "Be concise." in dossier
-
-    def test_tools_section_includes_definitions(self):
-        def get_weather(city: str) -> str:
-            """Return the current weather for *city*."""
-            return f"sunny in {city}"
-
-        agent = self._make_mock_agent(tools=[get_weather])
-        dossier = self._render(agent)
-        assert "Tools:" in dossier
-        assert "- get_weather" in dossier
-        assert '"city"' in dossier
-
-    def test_include_instructions_false_omits_section(self):
-        agent = self._make_mock_agent()
-        dossier = self._render(agent, include_instructions=False)
-        assert "Instructions:" not in dossier
-
-    def test_include_tools_false_omits_section(self):
-        def get_weather(city: str) -> str:
-            return f"sunny in {city}"
-
-        agent = self._make_mock_agent(tools=[get_weather])
-        dossier = self._render(agent, include_tools=False)
-        assert "Tools:" not in dossier
-
-    def test_context_providers_excluded_by_default_but_included_when_opted_in(self):
-        class StubProvider:
-            pass
-
-        agent = self._make_mock_agent(context_providers=[StubProvider()])
-        default_dossier = self._render(agent)
-        assert "Context providers:" not in default_dossier
-
-        opt_in_dossier = self._render(agent, include_context_providers=True)
-        assert "Context providers:" in opt_in_dossier
-        assert "- StubProvider" in opt_in_dossier
-
-    def test_examples_excluded_by_default_but_included_when_opted_in(self):
-        agent = self._make_mock_agent()
-        default_dossier = self._render(agent, examples=["What's the weather in NYC?"])
-        assert "Examples:" not in default_dossier
-
-        opt_in_dossier = self._render(
-            agent,
-            include_examples=True,
-            examples=["What's the weather in NYC?"],
-        )
-        assert "Examples:" in opt_in_dossier
-        assert "What's the weather in NYC?" in opt_in_dossier
-
-    def test_base_agent_method_returns_dossier_string(self):
-        from agent_framework._agents import BaseAgent
-
-        class _ConcreteAgent(BaseAgent):
-            pass
-
-        agent = _ConcreteAgent(name="test-agent", description="A test agent.")
-        dossier = agent.as_eval_source()
-        assert isinstance(dossier, str)
-        assert "Agent name: test-agent" in dossier
-
-
-class TestWorkflowAsEvalSource:
-    """Tests for Workflow.as_eval_source / _render_workflow_dossier."""
-
-    def _build_workflow(self, *, with_agent: bool = False) -> Any:
-        from unittest.mock import MagicMock
-
-        from agent_framework._workflows._agent_executor import AgentExecutor
-
-        workflow = MagicMock()
-        workflow.name = "demo-workflow"
-        workflow.description = "Routes user questions through a single agent."
-        workflow.to_dict.return_value = {
-            "name": "demo-workflow",
-            "id": "wf_1",
-            "start_executor_id": "agent_1",
-            "edge_groups": [],
-            "executors": {"agent_1": {"type": "AgentExecutor"}},
-        }
-
-        if with_agent:
-            inner_agent = MagicMock()
-            inner_agent.name = "inner-agent"
-            inner_agent.description = "Inner agent."
-            inner_agent.default_options = {"instructions": "Answer politely.", "tools": []}
-            inner_agent.context_providers = []
-            inner_agent.mcp_tools = []
-
-            executor = MagicMock(spec=AgentExecutor)
-            executor.agent = inner_agent
-            workflow.executors = {"agent_1": executor}
-        else:
-            workflow.executors = {}
-        return workflow
-
-    def _render(self, workflow: Any, **overrides: Any) -> str:
-        from agent_framework._evaluation import _render_workflow_dossier
-
-        kwargs: dict[str, Any] = {
-            "include_instructions": True,
-            "include_tools": True,
-            "include_context_providers": False,
-            "include_examples": False,
-            "examples": None,
-            "include_topology": True,
-        }
-        kwargs.update(overrides)
-        return _render_workflow_dossier(workflow, **kwargs)
-
-    def test_emits_dossier_with_topology(self):
-        workflow = self._build_workflow()
-        dossier = self._render(workflow)
-        assert isinstance(dossier, str)
-        assert "Workflow name: demo-workflow" in dossier
-        assert "Topology (JSON):" in dossier
-        assert '"start_executor_id": "agent_1"' in dossier
-
-    def test_topology_can_be_disabled(self):
-        workflow = self._build_workflow()
-        dossier = self._render(workflow, include_topology=False)
-        assert "Topology (JSON):" not in dossier
-
-    def test_per_agent_dossiers_included_when_executor_is_agent_executor(self):
-        workflow = self._build_workflow(with_agent=True)
-        dossier = self._render(workflow)
-        assert "Agents:" in dossier
-        assert "Executor: agent_1" in dossier
-        assert "Agent name: inner-agent" in dossier
-        assert "Answer politely." in dossier
-
-    def test_workflow_examples_excluded_by_default(self):
-        workflow = self._build_workflow()
-        default_dossier = self._render(workflow, examples=["Hi"])
-        assert "Examples:" not in default_dossier
-
-        opt_in_dossier = self._render(workflow, examples=["Hi"], include_examples=True)
-        assert "Examples:" in opt_in_dossier
diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py
index efbe0b8d248..1e40fbc68f6 100644
--- a/python/packages/foundry/agent_framework_foundry/__init__.py
+++ b/python/packages/foundry/agent_framework_foundry/__init__.py
@@ -10,22 +10,11 @@
     FoundryEmbeddingSettings,
     RawFoundryEmbeddingClient,
 )
-from ._evals_config import (
-    RubricGenerationSpec,
-    RubricSourceSpec,
-    build_sources,
-    load_evals_config,
-    parse_evals_config,
-)
 from ._foundry_evals import (
-    EvalGenerationSource,
     FoundryEvals,
     GeneratedEvaluatorRef,
-    RubricDimension,
-    agent_as_eval_source,
     evaluate_foundry_target,
     evaluate_traces,
-    workflow_as_eval_source,
 )
 from ._memory_provider import FoundryMemoryProvider
 
@@ -35,7 +24,6 @@
     __version__ = "0.0.0"
 
 __all__ = [
-    "EvalGenerationSource",
     "FoundryAgent",
     "FoundryAgentOptions",
     "FoundryChatClient",
@@ -50,15 +38,7 @@
     "RawFoundryAgentChatClient",
     "RawFoundryChatClient",
     "RawFoundryEmbeddingClient",
-    "RubricDimension",
-    "RubricGenerationSpec",
-    "RubricSourceSpec",
     "__version__",
-    "agent_as_eval_source",
-    "build_sources",
     "evaluate_foundry_target",
     "evaluate_traces",
-    "load_evals_config",
-    "parse_evals_config",
-    "workflow_as_eval_source",
 ]
diff --git a/python/packages/foundry/agent_framework_foundry/_evals_config.py b/python/packages/foundry/agent_framework_foundry/_evals_config.py
deleted file mode 100644
index 5f45e2854b8..00000000000
--- a/python/packages/foundry/agent_framework_foundry/_evals_config.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright (c) Microsoft. All rights reserved.
-
-"""YAML-driven evaluator configuration for rubric generation and evaluation.
-
-Defines the source-controlled config schema described in
-``adaptive-evals-draft.md``: a list of named rubric-generation specs that
-CI jobs and harnesses parse to drive
-:meth:`FoundryEvals.generate_rubric`.
-
-Example config:
-
-.. code-block:: yaml
-
-    evaluators:
-      reservation-agent-quality:
-        type: foundry.generated_rubric
-        category: quality
-        model: gpt-4o
-        agent: reservation-agent
-        sources:
-          - type: agent
-            include_instructions: true
-            include_tools: true
-          - type: dataset
-            name: reservation-business-rules
-            version: "1"
-
-Example loader usage:
-
-.. code-block:: python
-
-    from agent_framework_foundry import load_evals_config, FoundryEvals
-
-    config = load_evals_config("evaluators.yaml")
-    spec = config["reservation-agent-quality"]
-    sources = build_sources(spec, agent=agent)
-    ref = await FoundryEvals.generate_rubric(
-        project_client=client,
-        name=spec.name,
-        sources=sources,
-        category=spec.category,
-        model=spec.model,
-        display_name=spec.display_name,
-        description=spec.description,
-    )
-"""
-
-from __future__ import annotations
-
-import os
-from collections.abc import Mapping
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Literal, cast
-
-from agent_framework._feature_stage import ExperimentalFeature, experimental
-
-from ._foundry_evals import (
-    EvalGenerationSource,
-    agent_as_eval_source,
-    workflow_as_eval_source,
-)
-
-_RUBRIC_TYPE = "foundry.generated_rubric"
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-@dataclass(frozen=True)
-class RubricSourceSpec:
-    """A single source entry in a :class:`RubricGenerationSpec` ``sources`` list.
-
-    Mirrors the per-source YAML schema.  The :attr:`type` field is the
-    discriminator; only the fields relevant to each type are read.
-
-    Attributes:
-        type: One of ``"agent"``, ``"workflow"``, ``"prompt"``,
-            ``"dataset"``, ``"traces"``.
-        description: Optional description shown in Foundry UI.
-        include_instructions: Whether to include the bound agent /
-            workflow's instructions.  Applies to ``"agent"`` and
-            ``"workflow"`` types.
-        include_tools: Whether to include the bound agent / workflow's
-            tools.  Applies to ``"agent"`` and ``"workflow"`` types.
-        include_context_providers: Whether to include attached
-            context-provider class names.  Applies to ``"agent"`` and
-            ``"workflow"`` types.
-        include_examples: Whether to include ``examples``.  Applies to
-            ``"agent"`` and ``"workflow"`` types.
-        include_topology: Whether to include the JSON-encoded topology.
-            Applies to ``"workflow"`` type.
-        examples: Optional list of example queries for ``"agent"`` /
-            ``"workflow"`` sources.
-        prompt: Rendered dossier for ``"prompt"`` type.
-        agent_name: Hosted Foundry agent name for ``"agent"`` type with
-            a server-side reference.
-        name: Dataset name for ``"dataset"`` type.
-        version: Pinned dataset version.
-        metadata: Free-form metadata for ``"traces"`` sources.
-    """
-
-    type: Literal["agent", "workflow", "prompt", "dataset", "traces"]
-    description: str | None = None
-    include_instructions: bool = True
-    include_tools: bool = True
-    include_context_providers: bool = False
-    include_examples: bool = False
-    include_topology: bool = True
-    examples: tuple[str, ...] = field(default_factory=tuple)
-    prompt: str | None = None
-    agent_name: str | None = None
-    name: str | None = None
-    version: str | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-@dataclass(frozen=True)
-class RubricGenerationSpec:
-    """A single named entry from an evaluators YAML config.
-
-    Attributes:
-        name: Evaluator name (the YAML key under ``evaluators``).
-        type: Discriminator literal.  Must be
-            ``"foundry.generated_rubric"`` for rubric evaluators.
-        category: ``"quality"`` or ``"safety"``.
-        model: Optional model deployment to drive generation.
-        agent: Optional symbolic reference to the agent in the
-            caller's harness.  Resolved by user code into a
-            :class:`BaseAgent` and passed to
-            :func:`build_sources`.
-        workflow: Optional symbolic reference to a workflow.
-        display_name: Optional human-readable name.
-        description: Optional description.
-        sources: List of source specs to feed into generation.  When
-            empty, callers typically default to a single
-            ``RubricSourceSpec(type='agent')`` or
-            ``RubricSourceSpec(type='workflow')`` source.
-    """
-
-    name: str
-    type: str = _RUBRIC_TYPE
-    category: Literal["quality", "safety"] = "quality"
-    model: str | None = None
-    agent: str | None = None
-    workflow: str | None = None
-    display_name: str | None = None
-    description: str | None = None
-    sources: tuple[RubricSourceSpec, ...] = field(default_factory=tuple)
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-def load_evals_config(path: str | os.PathLike[str]) -> dict[str, RubricGenerationSpec]:
-    """Load a YAML evaluators config and return a name -> spec mapping.
-
-    Reads ``path`` (UTF-8) and parses the top-level ``evaluators``
-    mapping into :class:`RubricGenerationSpec` instances keyed by name.
-
-    Requires ``PyYAML``.  Raises :class:`ImportError` with a helpful
-    message when PyYAML is not installed.
-
-    Args:
-        path: Filesystem path to the YAML config.
-
-    Returns:
-        A dict mapping evaluator name to :class:`RubricGenerationSpec`.
-
-    Raises:
-        ImportError: If PyYAML is not installed.
-        ValueError: If the YAML file is malformed.
-    """
-    try:
-        import yaml  # type: ignore[import-untyped]
-    except ImportError as exc:
-        raise ImportError("load_evals_config requires PyYAML.  Install with `pip install pyyaml`.") from exc
-
-    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
-    return parse_evals_config(raw)
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-def parse_evals_config(data: Any) -> dict[str, RubricGenerationSpec]:
-    """Parse an already-loaded YAML mapping into rubric-generation specs.
-
-    Useful when callers manage YAML loading themselves (e.g. CI that
-    interpolates env vars before parsing).
-
-    Args:
-        data: A mapping with an ``"evaluators"`` key containing a mapping
-            of evaluator names to spec dicts.
-
-    Returns:
-        A dict mapping evaluator name to :class:`RubricGenerationSpec`.
-
-    Raises:
-        ValueError: If the structure is malformed.
-    """
-    if not isinstance(data, Mapping):
-        raise ValueError("Evaluators config must be a mapping.")
-    data_map = cast("Mapping[str, Any]", data)
-    raw_evaluators = data_map.get("evaluators")
-    if raw_evaluators is None:
-        raise ValueError("Evaluators config is missing a top-level 'evaluators' key.")
-    if not isinstance(raw_evaluators, Mapping):
-        raise ValueError("Evaluators config 'evaluators' entry must be a mapping.")
-    evaluators = cast("Mapping[str, Any]", raw_evaluators)
-
-    parsed: dict[str, RubricGenerationSpec] = {}
-    for name, raw in evaluators.items():
-        if not isinstance(raw, Mapping):
-            raise ValueError(f"Evaluator entry {name!r} must be a mapping, got {type(raw).__name__}.")
-        raw_map = cast("Mapping[str, Any]", raw)
-        parsed[name] = _parse_spec(name, raw_map)
-    return parsed
-
-
-def _parse_spec(name: str, raw: Mapping[str, Any]) -> RubricGenerationSpec:
-    type_value = raw.get("type", _RUBRIC_TYPE)
-    if type_value != _RUBRIC_TYPE:
-        raise ValueError(f"Evaluator {name!r} has unsupported type {type_value!r}; expected {_RUBRIC_TYPE!r}.")
-    category = raw.get("category", "quality")
-    if category not in ("quality", "safety"):
-        raise ValueError(f"Evaluator {name!r} has invalid category {category!r}; expected 'quality' or 'safety'.")
-
-    raw_sources_obj: Any = raw.get("sources") or ()
-    if not isinstance(raw_sources_obj, (list, tuple)):
-        raise ValueError(f"Evaluator {name!r} 'sources' must be a list.")
-    sources_iter: list[Any] = list(cast("Any", raw_sources_obj))
-    sources: list[RubricSourceSpec] = []
-    for index, raw_source in enumerate(sources_iter):
-        if not isinstance(raw_source, Mapping):
-            raise ValueError(
-                f"Evaluator {name!r} source entry {index} must be a mapping, got {type(raw_source).__name__}."
-            )
-        sources.append(_parse_source(name, index, cast("Mapping[str, Any]", raw_source)))
-
-    return RubricGenerationSpec(
-        name=name,
-        type=type_value,
-        category=category,
-        model=raw.get("model"),
-        agent=raw.get("agent"),
-        workflow=raw.get("workflow"),
-        display_name=raw.get("display_name"),
-        description=raw.get("description"),
-        sources=tuple(sources),
-    )
-
-
-def _parse_source(spec_name: str, index: int, raw: Mapping[str, Any]) -> RubricSourceSpec:
-    type_value = raw.get("type")
-    if type_value not in ("agent", "workflow", "prompt", "dataset", "traces"):
-        raise ValueError(
-            f"Evaluator {spec_name!r} source {index} has invalid type {type_value!r}; "
-            "expected one of 'agent', 'workflow', 'prompt', 'dataset', 'traces'."
-        )
-
-    examples_raw: Any = raw.get("examples") or ()
-    if not isinstance(examples_raw, (list, tuple)):
-        raise ValueError(f"Evaluator {spec_name!r} source {index} 'examples' must be a list.")
-    examples_iter: list[Any] = list(cast("Any", examples_raw))
-    examples = tuple(str(e) for e in examples_iter)
-
-    metadata_raw = raw.get("metadata")
-    if metadata_raw is not None and not isinstance(metadata_raw, Mapping):
-        raise ValueError(f"Evaluator {spec_name!r} source {index} 'metadata' must be a mapping.")
-
-    return RubricSourceSpec(
-        type=cast("Any", type_value),
-        description=raw.get("description"),
-        include_instructions=bool(raw.get("include_instructions", True)),
-        include_tools=bool(raw.get("include_tools", True)),
-        include_context_providers=bool(raw.get("include_context_providers", False)),
-        include_examples=bool(raw.get("include_examples", False)),
-        include_topology=bool(raw.get("include_topology", True)),
-        examples=examples,
-        prompt=raw.get("prompt"),
-        agent_name=raw.get("agent_name"),
-        name=raw.get("name"),
-        version=str(raw.get("version")) if raw.get("version") is not None else None,
-        metadata=dict(cast("Mapping[str, Any]", metadata_raw)) if metadata_raw is not None else None,
-    )
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-def build_sources(
-    spec: RubricGenerationSpec,
-    *,
-    agent: Any | None = None,
-    workflow: Any | None = None,
-) -> list[EvalGenerationSource]:
-    """Translate a spec's source list into :class:`EvalGenerationSource` instances.
-
-    Resolves each :class:`RubricSourceSpec` against the supplied
-    ``agent`` and ``workflow`` instances:
-
-    * ``type='agent'`` sources call :func:`agent_as_eval_source` with
-      the spec's include-flags.  If the source carries an
-      ``agent_name`` the agent is referenced server-side instead.
-    * ``type='workflow'`` sources call
-      :func:`workflow_as_eval_source` with the spec's include-flags.
-    * ``type='prompt'``, ``type='dataset'``, and ``type='traces'``
-      sources are translated directly into
-      :class:`EvalGenerationSource` instances without consulting the
-      runtime agent or workflow.
-
-    When the spec has no ``sources`` entries, defaults to a single
-    ``type='agent'`` source when an ``agent`` is provided, or a single
-    ``type='workflow'`` source when a ``workflow`` is provided.
-
-    Args:
-        spec: Parsed :class:`RubricGenerationSpec`.
-        agent: Optional agent instance for ``type='agent'`` sources.
-        workflow: Optional workflow instance for ``type='workflow'``
-            sources.
-
-    Returns:
-        A list of :class:`EvalGenerationSource` instances ready to pass
-        to :meth:`FoundryEvals.generate_rubric` as ``sources=``.
-
-    Raises:
-        ValueError: If a source references an agent or workflow that
-            was not supplied.
-    """
-    if not spec.sources:
-        if agent is not None:
-            return [agent_as_eval_source(agent)]
-        if workflow is not None:
-            return [workflow_as_eval_source(workflow)]
-        raise ValueError(f"Spec {spec.name!r} has no sources and no agent/workflow was provided to build_sources().")
-
-    out: list[EvalGenerationSource] = []
-    for src in spec.sources:
-        if src.type == "agent":
-            if src.agent_name:
-                out.append(
-                    EvalGenerationSource(
-                        type="agent",
-                        agent_name=src.agent_name,
-                        description=src.description,
-                    )
-                )
-                continue
-            if agent is None:
-                raise ValueError(f"Spec {spec.name!r} has a source of type 'agent' but no agent= was provided.")
-            out.append(
-                agent_as_eval_source(
-                    agent,
-                    include_instructions=src.include_instructions,
-                    include_tools=src.include_tools,
-                    include_context_providers=src.include_context_providers,
-                    include_examples=src.include_examples,
-                    examples=list(src.examples) if src.examples else None,
-                )
-            )
-        elif src.type == "workflow":
-            if workflow is None:
-                raise ValueError(f"Spec {spec.name!r} has a source of type 'workflow' but no workflow= was provided.")
-            out.append(
-                workflow_as_eval_source(
-                    workflow,
-                    include_instructions=src.include_instructions,
-                    include_tools=src.include_tools,
-                    include_context_providers=src.include_context_providers,
-                    include_examples=src.include_examples,
-                    examples=list(src.examples) if src.examples else None,
-                    include_topology=src.include_topology,
-                )
-            )
-        elif src.type == "prompt":
-            if not src.prompt:
-                raise ValueError(f"Spec {spec.name!r} has a 'prompt' source missing the 'prompt' field.")
-            out.append(EvalGenerationSource(type="prompt", prompt=src.prompt, description=src.description))
-        elif src.type == "dataset":
-            if not src.name:
-                raise ValueError(f"Spec {spec.name!r} has a 'dataset' source missing the 'name' field.")
-            out.append(
-                EvalGenerationSource(
-                    type="dataset",
-                    dataset_name=src.name,
-                    dataset_version=src.version,
-                    description=src.description,
-                )
-            )
-        elif src.type == "traces":
-            out.append(
-                EvalGenerationSource(
-                    type="traces",
-                    description=src.description,
-                    metadata=src.metadata,
-                )
-            )
-        else:  # pragma: no cover - guarded by _parse_source
-            raise ValueError(f"Spec {spec.name!r} has unknown source type {src.type!r}.")
-    return out
-
-
-__all__ = [
-    "RubricGenerationSpec",
-    "RubricSourceSpec",
-    "build_sources",
-    "load_evals_config",
-    "parse_evals_config",
-]
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index 7fddd64c38c..f242db06d91 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -30,7 +30,7 @@
 import logging
 from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Literal, cast
+from typing import TYPE_CHECKING, Any, cast
 
 from agent_framework._evaluation import (
     AgentEvalConverter,
@@ -48,332 +48,56 @@
 from ._chat_client import FoundryChatClient
 
 if TYPE_CHECKING:
-    from agent_framework._agents import BaseAgent
-    from agent_framework._workflows._workflow import Workflow
     from azure.ai.projects.aio import AIProjectClient
     from openai.types.evals import RunRetrieveResponse
 
 logger = logging.getLogger(__name__)
 
 
-# region Generated rubric evaluator types
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-@dataclass(frozen=True)
-class RubricDimension:
-    """A single dimension of a generated rubric evaluator.
-
-    Rubric evaluators score each item along one or more named dimensions,
-    each with its own description and weight.  Foundry's evaluator
-    generation pipeline produces these dimensions from agent/workflow
-    metadata; ``RubricDimension`` surfaces them so callers can inspect a
-    generated evaluator's structure without round-tripping through the
-    portal.
-
-    Attributes:
-        id: Stable identifier for the dimension (e.g. ``"policy_enforcement"``).
-        description: Natural-language description of what the dimension scores.
-        weight: Integer weight controlling the dimension's contribution to
-            the aggregate score.
-        always_applicable: When ``False``, evaluators may mark this
-            dimension non-applicable on a per-item basis.
-    """
-
-    id: str
-    description: str
-    weight: int
-    always_applicable: bool = False
+# region Generated rubric evaluator references
 
 
 @experimental(feature_id=ExperimentalFeature.EVALS)
 @dataclass(frozen=True)
 class GeneratedEvaluatorRef:
-    """A reference to a generated rubric evaluator stored in Foundry.
+    """A reference to a rubric evaluator that already exists in Foundry.
 
     Pass instances of this class to :class:`FoundryEvals` to score items
-    with a previously generated rubric evaluator.  Construct directly
-    when the evaluator already exists, or obtain one from
-    :meth:`FoundryEvals.generate_rubric`.
+    with a pre-existing rubric evaluator (manually authored or
+    auto-generated through the Foundry portal).  agent-framework is a
+    consumer here: it does not create or modify the evaluator definition;
+    it only references the persisted version by name.
 
     Pinning ``version`` is strongly recommended so evaluation runs are
-    reproducible.  The dataclass accepts ``version=None`` for the
-    convenience of :meth:`latest`, but ``FoundryEvals`` emits a warning
-    whenever a versionless reference is used; CI gates should always
-    pass a concrete version.
+    reproducible.  ``version=None`` resolves to whichever version is
+    current at execution time; :class:`FoundryEvals` emits a warning when
+    a versionless reference is used.  CI gates should always pass a
+    concrete version.
 
     Attributes:
-        name: Evaluator name as stored in the Foundry project (e.g.
-            ``"my-policy-evaluator"``).  Distinct from built-in
-            evaluators such as ``"builtin.relevance"``.
+        name: Evaluator name as stored in the Foundry project (for
+            example ``"reservation-policy-rubric"``).  Distinct from
+            built-in evaluators such as ``"builtin.relevance"``.
         version: Pinned evaluator version.  ``None`` means "latest" —
-            this is discouraged for CI/repro and ``FoundryEvals`` will
-            emit a warning when used.
-        category: ``"quality"`` for ungrounded rubric scoring,
-            ``"safety"`` for safety-focused evaluators.  Matches the
-            Foundry evaluator's declared category.
+            this is discouraged for CI/repro and :class:`FoundryEvals`
+            will emit a warning when used.
         display_name: Optional human-readable name used in result
             summaries.  Defaults to ``name`` when unset.
-        description: Optional description carried over from the
-            generated evaluator definition for documentation.
-        dimensions: Optional snapshot of the rubric's dimensions for
-            inspection.  Not required to invoke the evaluator — the
-            service uses the persisted definition.
-        pass_threshold: Optional aggregate score threshold (0.0-1.0) the
-            evaluator considers a passing item.  ``None`` defers to the
-            evaluator's stored default.
     """
 
     name: str
     version: str | None = None
-    category: Literal["quality", "safety"] = "quality"
     display_name: str | None = None
-    description: str | None = None
-    dimensions: tuple[RubricDimension, ...] | None = None
-    pass_threshold: float | None = None
 
     @classmethod
-    def latest(
-        cls,
-        name: str,
-        *,
-        category: Literal["quality", "safety"] = "quality",
-        display_name: str | None = None,
-        description: str | None = None,
-    ) -> GeneratedEvaluatorRef:
+    def latest(cls, name: str, *, display_name: str | None = None) -> GeneratedEvaluatorRef:
         """Construct a versionless reference (resolves to the latest version at run time).
 
         Discouraged for reproducible runs.  Prefer the constructor with
         an explicit ``version`` so CI and replay evaluations stay stable
-        when the evaluator is regenerated.
+        when the evaluator is updated in Foundry.
         """
-        return cls(
-            name=name,
-            version=None,
-            category=category,
-            display_name=display_name,
-            description=description,
-        )
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-@dataclass(frozen=True)
-class EvalGenerationSource:
-    """A source description passed to Foundry's evaluator generation pipeline.
-
-    Rubric evaluator generation consumes one or more sources that describe
-    the agent or workflow under evaluation.  ``FoundryEvals`` translates
-    instances into the underlying ``*EvaluatorGenerationJobSource`` SDK
-    types.
-
-    Discriminated by :attr:`type`:
-
-    * ``"prompt"`` - a free-form textual dossier (typical for local agents
-      and workflows whose tools cannot be fetched server-side).
-    * ``"agent"`` - a hosted Foundry agent referenced by name so the
-      service fetches tool definitions and metadata directly.
-    * ``"dataset"`` - a Foundry dataset of recorded interactions.
-    * ``"traces"`` - tracing data scoped by metadata.
-
-    Only the fields relevant to :attr:`type` are populated; the remaining
-    fields stay ``None``.
-
-    Attributes:
-        type: Source kind.  See discriminator above.
-        description: Optional short description shown in Foundry UI.
-        prompt: Rendered dossier for ``type="prompt"`` sources.
-        agent_name: Hosted Foundry agent name for ``type="agent"`` sources.
-        agent_version: Optional pinned hosted-agent version for
-            ``type="agent"`` sources.  ``None`` resolves to the latest
-            version at generation time; pin for reproducible runs.
-        dataset_name: Foundry dataset name for ``type="dataset"`` sources.
-        dataset_version: Pinned dataset version (recommended for repro).
-        metadata: Free-form metadata.  Used by ``type="traces"`` sources
-            for tracing-attribute filters and as a generic escape hatch
-            for additional fields not yet modeled.
-    """
-
-    type: Literal["prompt", "dataset", "agent", "traces"]
-    description: str | None = None
-    prompt: str | None = None
-    agent_name: str | None = None
-    agent_version: str | None = None
-    dataset_name: str | None = None
-    dataset_version: str | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-def agent_as_eval_source(
-    agent: BaseAgent,
-    *,
-    include_instructions: bool = True,
-    include_tools: bool = True,
-    include_context_providers: bool = False,
-    include_examples: bool = False,
-    examples: Sequence[str] | None = None,
-    hosted_agent_name: str | None = None,
-    hosted_agent_version: str | None = None,
-    force_prompt_source: bool = False,
-) -> EvalGenerationSource:
-    """Render an agent as an :class:`EvalGenerationSource` for rubric generation.
-
-    Picks the best Foundry source variant for the supplied agent:
-
-    * **Hosted Foundry agents** (``FoundryAgent`` connected to a Prompt
-      Agent or Hosted Agent in a Foundry project) are emitted as
-      ``type="agent"`` sources keyed by ``agent_name`` so the service
-      fetches instructions, tools, and metadata directly from the agent
-      registry — independent of whatever the local wrapper happens to
-      hold.  Detected automatically from ``agent.chat_client.agent_name``
-      and ``agent.chat_client.agent_version``.
-    * **Local agents** (any other ``BaseAgent`` whose instructions and
-      tools live client-side, e.g. ``FoundryChatClient``-backed agents or
-      pure OpenAI Responses agents) are emitted as ``type="prompt"``
-      sources with a rendered text dossier.
-
-    Override the heuristic by passing ``hosted_agent_name`` explicitly
-    (forces an ``"agent"`` source) or ``force_prompt_source=True``
-    (forces a ``"prompt"`` source — useful when you want the service to
-    score a hosted agent against the *local* wrapper's overrides).
-
-    Args:
-        agent: Agent instance (typically a ``BaseAgent`` subclass).
-        include_instructions: Whether to include the agent's instructions
-            text in the dossier (``"prompt"`` sources only).  Defaults to
-            ``True``.
-        include_tools: Whether to include tool definitions in the dossier
-            (``"prompt"`` sources only).  Defaults to ``True``.
-        include_context_providers: Whether to include the names of
-            attached context-provider classes in the dossier
-            (``"prompt"`` sources only).  Defaults to ``False`` to avoid
-            leaking implementation details.
-        include_examples: Whether to include the supplied ``examples`` in
-            the dossier (``"prompt"`` sources only).  Defaults to
-            ``False`` to avoid shipping potentially sensitive sample
-            inputs by default.
-        examples: Optional sample queries / interactions to include when
-            ``include_examples`` is ``True``.
-        hosted_agent_name: When set, emit a ``type="agent"`` source
-            referencing this hosted Foundry agent name regardless of
-            auto-detection.  Use to override or supplement the
-            heuristic.
-        hosted_agent_version: When set together with a hosted-agent
-            source, pins the source to a specific hosted-agent version.
-            Recommended for reproducible rubric generation against
-            PromptAgents.
-        force_prompt_source: When ``True``, always emit a
-            ``type="prompt"`` source with the rendered dossier even when
-            the agent is a hosted Foundry agent.  Useful when the local
-            wrapper holds overrides the service-side agent doesn't see.
-
-    Returns:
-        An :class:`EvalGenerationSource` describing the agent.
-    """
-    agent_description = getattr(agent, "description", None)
-
-    resolved_name = hosted_agent_name
-    resolved_version = hosted_agent_version
-    if resolved_name is None and not force_prompt_source:
-        detected_name, detected_version = _detect_hosted_foundry_agent(agent)
-        if detected_name is not None:
-            resolved_name = detected_name
-            if resolved_version is None:
-                resolved_version = detected_version
-
-    if resolved_name is not None and not force_prompt_source:
-        return EvalGenerationSource(
-            type="agent",
-            agent_name=resolved_name,
-            agent_version=resolved_version,
-            description=agent_description,
-        )
-
-    prompt = agent.as_eval_source(
-        include_instructions=include_instructions,
-        include_tools=include_tools,
-        include_context_providers=include_context_providers,
-        include_examples=include_examples,
-        examples=examples,
-    )
-    return EvalGenerationSource(
-        type="prompt",
-        prompt=prompt,
-        description=agent_description,
-    )
-
-
-def _detect_hosted_foundry_agent(agent: BaseAgent) -> tuple[str | None, str | None]:
-    """Return ``(agent_name, agent_version)`` for hosted Foundry agents, else ``(None, None)``.
-
-    A hosted Foundry agent is one whose ``chat_client`` exposes a string
-    ``agent_name`` — the convention used by ``RawFoundryAgentChatClient``
-    when ``FoundryAgent`` connects to an existing Prompt Agent or Hosted
-    Agent in a Foundry project.  Only string values are accepted so
-    test doubles using ``MagicMock`` for ``chat_client`` are not
-    mis-detected.
-    """
-    chat_client = getattr(agent, "chat_client", None)
-    if chat_client is None:
-        return None, None
-    name = getattr(chat_client, "agent_name", None)
-    version = getattr(chat_client, "agent_version", None)
-    if not isinstance(name, str) or not name:
-        return None, None
-    if not isinstance(version, str) or not version:
-        version = None
-    return name, version
-
-
-@experimental(feature_id=ExperimentalFeature.EVALS)
-def workflow_as_eval_source(
-    workflow: Workflow,
-    *,
-    include_instructions: bool = True,
-    include_tools: bool = True,
-    include_context_providers: bool = False,
-    include_examples: bool = False,
-    examples: Sequence[str] | None = None,
-    include_topology: bool = True,
-) -> EvalGenerationSource:
-    """Render a workflow as an :class:`EvalGenerationSource` for rubric generation.
-
-    Wraps :meth:`Workflow.as_eval_source` to package the workflow's
-    rendered dossier (workflow name, description, topology, per-agent
-    dossiers) into a typed ``type="prompt"`` Foundry generation source.
-
-    Args:
-        workflow: Workflow instance to render.
-        include_instructions: Per-agent instructions inclusion.
-        include_tools: Per-agent tools inclusion.
-        include_context_providers: Per-agent context-provider inclusion.
-            Defaults to ``False``.
-        include_examples: Per-agent examples inclusion.  Defaults to
-            ``False``.
-        examples: Optional workflow-level sample queries.  Rendered into
-            a top-level ``Examples:`` section when ``include_examples`` is
-            ``True``.
-        include_topology: Whether to embed the JSON-encoded workflow
-            topology produced by :meth:`Workflow.to_dict`.  Defaults to
-            ``True``.
-
-    Returns:
-        A ``type="prompt"`` :class:`EvalGenerationSource` describing the
-        workflow.
-    """
-    prompt = workflow.as_eval_source(
-        include_instructions=include_instructions,
-        include_tools=include_tools,
-        include_context_providers=include_context_providers,
-        include_examples=include_examples,
-        examples=examples,
-        include_topology=include_topology,
-    )
-    return EvalGenerationSource(
-        type="prompt",
-        prompt=prompt,
-        description=workflow.description,
-    )
+        return cls(name=name, version=None, display_name=display_name)
 
 
 # endregion
@@ -1237,561 +961,6 @@ async def _evaluate_via_dataset(
             provider=self.name,
         )
 
-    @classmethod
-    @experimental(feature_id=ExperimentalFeature.EVALS)
-    async def generate_rubric(
-        cls,
-        *,
-        project_client: AIProjectClient,
-        name: str,
-        agent: BaseAgent | None = None,
-        workflow: Workflow | None = None,
-        sources: Sequence[EvalGenerationSource] | None = None,
-        category: Literal["quality", "safety"] = "quality",
-        model: str | None = None,
-        display_name: str | None = None,
-        description: str | None = None,
-        operation_id: str | None = None,
-        poll_interval: float = 5.0,
-        timeout: float = 600.0,
-    ) -> GeneratedEvaluatorRef:
-        """Generate a Foundry rubric evaluator from an agent or workflow.
-
-        Drives the Foundry evaluator-generation long-running operation
-        (``client.beta.evaluators.create_generation_job``) end-to-end and
-        returns a pinned :class:`GeneratedEvaluatorRef` for use with
-        :class:`FoundryEvals` ``evaluators=`` lists.
-
-        Exactly one of ``agent``, ``workflow``, or ``sources`` must be
-        supplied.  When ``agent`` or ``workflow`` is given,
-        :func:`agent_as_eval_source` / :func:`workflow_as_eval_source` is
-        used to build a single conservative source (instructions and
-        tools included; examples and context providers excluded).  Pass
-        ``sources=`` directly to control inclusion explicitly or to
-        provide multiple sources.
-
-        Requires ``azure-ai-projects`` with the rubric-generation APIs
-        (currently ``2.3.0a*`` on the Azure SDK dev feed; tracked for an
-        upcoming PyPI release).  Raises :class:`NotImplementedError` with
-        a clear message when the dependency is unavailable.
-
-        Keyword Args:
-            project_client: Async ``AIProjectClient`` for the target
-                Foundry project.
-            name: Evaluator name to register in the project.  Must be a
-                stable identifier (e.g. ``"policy-enforcement-v1"``).
-            agent: Optional ``BaseAgent`` to derive a source from.
-            workflow: Optional ``Workflow`` to derive a source from.
-            sources: Explicit list of :class:`EvalGenerationSource`
-                instances.  Mutually exclusive with ``agent`` / ``workflow``.
-            category: ``"quality"`` or ``"safety"``.  Defaults to
-                ``"quality"``.
-            model: Optional model deployment to drive generation.  When
-                omitted the service picks a default.
-            display_name: Optional human-readable name for the evaluator.
-            description: Optional description for the evaluator.
-            operation_id: Optional caller-supplied operation id to make
-                the create call idempotent.
-            poll_interval: Seconds between job-status polls.
-            timeout: Maximum seconds to wait for the job to complete.
-
-        Returns:
-            A pinned :class:`GeneratedEvaluatorRef` referring to the
-            newly created evaluator.
-
-        Raises:
-            ValueError: If the source arguments are inconsistent.
-            NotImplementedError: If the installed ``azure-ai-projects``
-                version does not expose the rubric APIs.
-            TimeoutError: If the job does not complete within ``timeout``.
-            RuntimeError: If the generation job ends in a non-succeeded
-                terminal state.
-        """
-        resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources)
-
-        if category not in ("quality", "safety"):
-            raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.")
-
-        try:
-            sdk_types = _import_generation_sdk_types()
-        except _RubricSdkUnavailableError as exc:
-            raise NotImplementedError(str(exc)) from exc
-
-        sdk_sources = [_to_sdk_source(s, sdk_types) for s in resolved_sources]
-
-        inputs_kwargs: dict[str, Any] = {
-            "name": name,
-            "category": category,
-            "sources": sdk_sources,
-        }
-        if model is not None:
-            inputs_kwargs["model"] = model
-        if display_name is not None:
-            inputs_kwargs["display_name"] = display_name
-        if description is not None:
-            inputs_kwargs["description"] = description
-
-        inputs = sdk_types.EvaluatorGenerationInputs(**inputs_kwargs)
-        job = sdk_types.EvaluatorGenerationJob(inputs=inputs)
-
-        create_kwargs: dict[str, Any] = {"job": job}
-        if operation_id is not None:
-            create_kwargs["operation_id"] = operation_id
-
-        evaluators_ops = _get_beta_evaluators(project_client)
-        created = await evaluators_ops.create_generation_job(**create_kwargs)
-        completed = await _poll_generation_job(
-            evaluators_ops,
-            created,
-            poll_interval=poll_interval,
-            timeout=timeout,
-        )
-
-        return _generation_job_to_ref(completed, category=category)
-
-    @classmethod
-    @experimental(feature_id=ExperimentalFeature.EVALS)
-    async def create_rubric_evaluator(
-        cls,
-        *,
-        project_client: AIProjectClient,
-        name: str,
-        dimensions: Sequence[RubricDimension],
-        category: Literal["quality", "safety"] = "quality",
-        pass_threshold: float | None = None,
-        display_name: str | None = None,
-        description: str | None = None,
-        tags: dict[str, str] | None = None,
-        metadata: dict[str, str] | None = None,
-    ) -> GeneratedEvaluatorRef:
-        """Register a rubric evaluator from caller-supplied dimensions.
-
-        This is the *manual* counterpart to :meth:`generate_rubric` and
-        maps directly to ``project_client.beta.evaluators.create_version``.
-        Use it to bring a rubric you authored elsewhere (e.g. authored
-        from an agent's local context, ported from another framework, or
-        hand-tuned) into Foundry as a versioned ``EvaluatorVersion``
-        that any subsequent ``evaluators=`` list can reference via the
-        returned :class:`GeneratedEvaluatorRef`.
-
-        The service auto-attaches a non-editable residual dimension
-        (``general_quality`` for ``category="quality"``,
-        ``general_policy_compliance`` for ``"safety"``) — do not include
-        it in ``dimensions``.
-
-        Keyword Args:
-            project_client: Async ``AIProjectClient`` for the target
-                Foundry project.
-            name: Stable evaluator name (e.g.
-                ``"reservation-agent-policy-v1"``). A new version is
-                allocated on each call.
-            dimensions: One or more :class:`RubricDimension` instances
-                describing the scoring blueprint. Each dimension's
-                ``id`` must be unique; ``weight`` must be in ``[1, 10]``.
-            category: ``"quality"`` (default) or ``"safety"``.
-            pass_threshold: Optional aggregate pass threshold on the
-                normalized 0.0-1.0 scale. Defaults to the service-side
-                default of ``0.5`` when omitted.
-            display_name: Optional human-readable name shown in the
-                Foundry portal.
-            description: Optional asset description.
-            tags: Optional asset tags.
-            metadata: Optional free-form metadata persisted with the
-                evaluator definition.
-
-        Returns:
-            A pinned :class:`GeneratedEvaluatorRef` referring to the
-            newly created evaluator version.
-
-        Raises:
-            ValueError: If ``dimensions`` is empty, contains duplicate
-                ids, or contains a weight outside ``[1, 10]``.
-            NotImplementedError: If the installed ``azure-ai-projects``
-                version does not expose the manual rubric APIs.
-        """
-        if category not in ("quality", "safety"):
-            raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.")
-        if pass_threshold is not None and not (0.0 <= pass_threshold <= 1.0):
-            raise ValueError(f"pass_threshold must be in [0.0, 1.0] when set (got {pass_threshold!r}).")
-        if not dimensions:
-            raise ValueError("create_rubric_evaluator requires at least one dimension.")
-
-        try:
-            sdk_types = _import_manual_rubric_sdk_types()
-        except _RubricSdkUnavailableError as exc:
-            raise NotImplementedError(str(exc)) from exc
-
-        sdk_dimensions = _to_sdk_dimensions(dimensions, sdk_types.Dimension)
-        definition_kwargs: dict[str, Any] = {"dimensions": sdk_dimensions}
-        if pass_threshold is not None:
-            definition_kwargs["pass_threshold"] = pass_threshold
-        definition = sdk_types.RubricBasedEvaluatorDefinition(**definition_kwargs)
-
-        version_kwargs: dict[str, Any] = {
-            "evaluator_type": "custom",
-            "categories": [category],
-            "definition": definition,
-        }
-        if display_name is not None:
-            version_kwargs["display_name"] = display_name
-        if description is not None:
-            version_kwargs["description"] = description
-        if tags is not None:
-            version_kwargs["tags"] = tags
-        if metadata is not None:
-            version_kwargs["metadata"] = metadata
-
-        evaluator_version = sdk_types.EvaluatorVersion(**version_kwargs)
-        evaluators_ops = _get_beta_evaluators(project_client)
-        created = await evaluators_ops.create_version(name, evaluator_version=evaluator_version)
-
-        return _evaluator_version_to_ref(created, fallback_name=name, category=category)
-
-
-_TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"})
-
-
-class _RubricSdkUnavailableError(Exception):
-    """Raised when azure-ai-projects lacks the rubric-generation APIs."""
-
-
-@dataclass(frozen=True)
-class _GenerationSdkTypes:
-    """Resolved SDK type handles for rubric-evaluator generation."""
-
-    EvaluatorGenerationInputs: Any
-    EvaluatorGenerationJob: Any
-    PromptSource: Any
-    AgentSource: Any | None
-    DatasetSource: Any | None
-    TracesSource: Any | None
-
-
-@dataclass(frozen=True)
-class _ManualRubricSdkTypes:
-    """Resolved SDK type handles for manual rubric-evaluator creation."""
-
-    EvaluatorVersion: Any
-    RubricBasedEvaluatorDefinition: Any
-    Dimension: Any
-
-
-_RUBRIC_SDK_MISSING_MSG = (
-    "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs "
-    "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). "
-    "Install a build that exposes "
-    "`azure.ai.projects.models.EvaluatorGenerationInputs` and "
-    "`AIProjectClient.beta.evaluators.create_generation_job`."
-)
-
-
-_MANUAL_RUBRIC_SDK_MISSING_MSG = (
-    "FoundryEvals.create_rubric_evaluator requires the manual rubric-evaluator "
-    "APIs from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev "
-    "feed). Install a build that exposes "
-    "`azure.ai.projects.models.RubricBasedEvaluatorDefinition`, "
-    "`azure.ai.projects.models.Dimension`, and "
-    "`AIProjectClient.beta.evaluators.create_version`."
-)
-
-
-def _import_generation_sdk_types() -> _GenerationSdkTypes:
-    """Lazily resolve the rubric-generation SDK types from azure-ai-projects."""
-    try:
-        from azure.ai.projects import models as _models  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) from exc
-
-    models_mod: Any = _models
-    inputs_cls: Any = getattr(models_mod, "EvaluatorGenerationInputs", None)
-    job_cls: Any = getattr(models_mod, "EvaluatorGenerationJob", None)
-    prompt_cls: Any = getattr(models_mod, "PromptEvaluatorGenerationJobSource", None)
-    if inputs_cls is None or job_cls is None or prompt_cls is None:
-        raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG)
-
-    agent_cls: Any = getattr(models_mod, "AgentEvaluatorGenerationJobSource", None)
-    dataset_cls: Any = getattr(models_mod, "DatasetEvaluatorGenerationJobSource", None)
-    traces_cls: Any = getattr(models_mod, "TracesEvaluatorGenerationJobSource", None)
-
-    return _GenerationSdkTypes(
-        EvaluatorGenerationInputs=inputs_cls,
-        EvaluatorGenerationJob=job_cls,
-        PromptSource=prompt_cls,
-        AgentSource=agent_cls,
-        DatasetSource=dataset_cls,
-        TracesSource=traces_cls,
-    )
-
-
-def _import_manual_rubric_sdk_types() -> _ManualRubricSdkTypes:
-    """Lazily resolve the manual rubric-evaluator SDK types from azure-ai-projects."""
-    try:
-        from azure.ai.projects import models as _models  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG) from exc
-
-    models_mod: Any = _models
-    version_cls: Any = getattr(models_mod, "EvaluatorVersion", None)
-    definition_cls: Any = getattr(models_mod, "RubricBasedEvaluatorDefinition", None)
-    dimension_cls: Any = getattr(models_mod, "Dimension", None)
-    if version_cls is None or definition_cls is None or dimension_cls is None:
-        raise _RubricSdkUnavailableError(_MANUAL_RUBRIC_SDK_MISSING_MSG)
-
-    return _ManualRubricSdkTypes(
-        EvaluatorVersion=version_cls,
-        RubricBasedEvaluatorDefinition=definition_cls,
-        Dimension=dimension_cls,
-    )
-
-
-def _to_sdk_dimensions(
-    dimensions: Sequence[RubricDimension],
-    dimension_cls: Any,
-) -> list[Any]:
-    """Translate user-facing ``RubricDimension`` instances to SDK ``Dimension`` models.
-
-    The agent-framework type uses ``id`` (matching the runtime output
-    schema and competing frameworks); the SDK input model uses
-    ``dimension_id`` for the editable identifier.
-    """
-    if not dimensions:
-        raise ValueError("create_rubric_evaluator requires at least one dimension.")
-    seen: set[str] = set()
-    sdk_dims: list[Any] = []
-    for dim in dimensions:
-        if not dim.id:
-            raise ValueError("RubricDimension.id must be a non-empty string.")
-        if not dim.description:
-            raise ValueError(f"RubricDimension(id={dim.id!r}).description must be non-empty.")
-        if not isinstance(dim.weight, int) or not (1 <= dim.weight <= 10):
-            raise ValueError(f"RubricDimension(id={dim.id!r}).weight must be an int in [1, 10] (got {dim.weight!r}).")
-        if dim.id in seen:
-            raise ValueError(f"Duplicate RubricDimension.id={dim.id!r}; ids must be unique within a rubric.")
-        seen.add(dim.id)
-        kwargs: dict[str, Any] = {
-            "dimension_id": dim.id,
-            "description": dim.description,
-            "weight": dim.weight,
-        }
-        if dim.always_applicable:
-            kwargs["always_applicable"] = True
-        sdk_dims.append(dimension_cls(**kwargs))
-    return sdk_dims
-
-
-def _evaluator_version_to_ref(
-    created: Any,
-    *,
-    fallback_name: str,
-    category: Literal["quality", "safety"],
-) -> GeneratedEvaluatorRef:
-    """Translate a persisted ``EvaluatorVersion`` to a :class:`GeneratedEvaluatorRef`.
-
-    Used by both the generation-job path and the manual ``create_version``
-    path so callers see a uniform pinned reference regardless of how the
-    evaluator was authored.
-    """
-    ev_name = getattr(created, "name", None) or fallback_name
-    ev_version = getattr(created, "version", None)
-    if ev_version is None:
-        raise RuntimeError("Created evaluator version is missing a version identifier.")
-
-    definition: Any = getattr(created, "definition", None)
-    dimensions: tuple[RubricDimension, ...] | None = None
-    raw_dims: Any = getattr(definition, "dimensions", None) if definition is not None else None
-    if raw_dims:
-        parsed: list[RubricDimension] = []
-        for entry in raw_dims:
-            dim_id = getattr(entry, "dimension_id", None) or getattr(entry, "id", None)
-            try:
-                parsed.append(
-                    RubricDimension(
-                        id=str(dim_id or ""),
-                        description=str(getattr(entry, "description", "") or ""),
-                        weight=int(getattr(entry, "weight", 0) or 0),
-                        always_applicable=bool(getattr(entry, "always_applicable", False)),
-                    )
-                )
-            except (TypeError, ValueError):
-                logger.debug("Skipping malformed dimension on persisted evaluator", exc_info=True)
-        if parsed:
-            dimensions = tuple(parsed)
-
-    pass_threshold: float | None = None
-    raw_threshold: Any = getattr(definition, "pass_threshold", None) if definition is not None else None
-    if isinstance(raw_threshold, (int, float)):
-        pass_threshold = float(raw_threshold)
-
-    return GeneratedEvaluatorRef(
-        name=str(ev_name),
-        version=str(ev_version),
-        category=category,
-        display_name=getattr(created, "display_name", None),
-        description=getattr(created, "description", None),
-        dimensions=dimensions,
-        pass_threshold=pass_threshold,
-    )
-
-
-def _get_beta_evaluators(project_client: AIProjectClient) -> Any:
-    """Return the ``project_client.beta.evaluators`` operations group, or raise."""
-    beta = getattr(project_client, "beta", None)
-    evaluators_ops = getattr(beta, "evaluators", None) if beta is not None else None
-    if evaluators_ops is None:
-        raise NotImplementedError(_RUBRIC_SDK_MISSING_MSG)
-    return evaluators_ops
-
-
-def _coalesce_generation_sources(
-    *,
-    agent: BaseAgent | None,
-    workflow: Workflow | None,
-    sources: Sequence[EvalGenerationSource] | None,
-) -> list[EvalGenerationSource]:
-    if sources is not None and not sources:
-        raise ValueError("sources= must contain at least one EvalGenerationSource.")
-    supplied = [bool(agent), bool(workflow), bool(sources)]
-    if sum(supplied) == 0:
-        raise ValueError("Provide one of agent=, workflow=, or sources=.")
-    if sum(supplied) > 1:
-        raise ValueError("Provide only one of agent=, workflow=, or sources=.")
-    if sources is not None:
-        return list(sources)
-    if agent is not None:
-        return [agent_as_eval_source(agent)]
-    if workflow is None:
-        raise ValueError("workflow= must be provided when agent= and sources= are not set.")
-    return [workflow_as_eval_source(workflow)]
-
-
-def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) -> Any:
-    """Translate an :class:`EvalGenerationSource` to its SDK counterpart."""
-    if source.type == "prompt":
-        if not source.prompt:
-            raise ValueError("EvalGenerationSource(type='prompt') requires a non-empty prompt.")
-        kwargs: dict[str, Any] = {"prompt": source.prompt}
-        if source.description is not None:
-            kwargs["description"] = source.description
-        return sdk_types.PromptSource(**kwargs)
-    if source.type == "agent":
-        if sdk_types.AgentSource is None:
-            raise NotImplementedError("Installed azure-ai-projects does not expose AgentEvaluatorGenerationJobSource.")
-        if not source.agent_name:
-            raise ValueError("EvalGenerationSource(type='agent') requires agent_name.")
-        kwargs = {"agent_name": source.agent_name}
-        if source.agent_version is not None:
-            kwargs["agent_version"] = source.agent_version
-        if source.description is not None:
-            kwargs["description"] = source.description
-        return sdk_types.AgentSource(**kwargs)
-    if source.type == "dataset":
-        if sdk_types.DatasetSource is None:
-            raise NotImplementedError(
-                "Installed azure-ai-projects does not expose DatasetEvaluatorGenerationJobSource."
-            )
-        if not source.dataset_name:
-            raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.")
-        # SDK uses ``name`` / ``version`` (not ``dataset_name`` / ``dataset_version``).
-        kwargs = {"name": source.dataset_name}
-        if source.dataset_version is not None:
-            kwargs["version"] = source.dataset_version
-        if source.description is not None:
-            kwargs["description"] = source.description
-        return sdk_types.DatasetSource(**kwargs)
-    if source.type == "traces":
-        if sdk_types.TracesSource is None:
-            raise NotImplementedError("Installed azure-ai-projects does not expose TracesEvaluatorGenerationJobSource.")
-        kwargs = {}
-        if source.metadata is not None:
-            kwargs["metadata"] = source.metadata
-        if source.description is not None:
-            kwargs["description"] = source.description
-        return sdk_types.TracesSource(**kwargs)
-    raise ValueError(f"Unknown EvalGenerationSource type: {source.type!r}")
-
-
-async def _poll_generation_job(
-    evaluators_ops: Any,
-    job: Any,
-    *,
-    poll_interval: float,
-    timeout: float,
-) -> Any:
-    """Poll a rubric-generation job until it reaches a terminal state."""
-    job_id = getattr(job, "id", None)
-    if not job_id:
-        raise RuntimeError("Rubric generation job did not return an id.")
-
-    loop = asyncio.get_running_loop()
-    deadline = loop.time() + timeout
-    current = job
-    while True:
-        status = (getattr(current, "status", "") or "").lower()
-        if status in _TERMINAL_GENERATION_STATUSES:
-            if status != "succeeded":
-                err = getattr(current, "error", None)
-                err_msg = getattr(err, "message", None) or str(err) if err is not None else status
-                raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {err_msg}")
-            return current
-        remaining = deadline - loop.time()
-        if remaining <= 0:
-            raise TimeoutError(
-                f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})."
-            )
-        await asyncio.sleep(min(poll_interval, remaining))
-        current = await evaluators_ops.get_generation_job(job_id)
-
-
-def _generation_job_to_ref(job: Any, *, category: Literal["quality", "safety"]) -> GeneratedEvaluatorRef:
-    """Build a pinned :class:`GeneratedEvaluatorRef` from a completed job."""
-    artifacts: Any = getattr(job, "artifacts", None)
-    evaluator: Any = getattr(artifacts, "evaluator", None) if artifacts is not None else None
-    if evaluator is None:
-        raise RuntimeError("Rubric generation job completed without an evaluator artifact.")
-
-    ev_name = getattr(evaluator, "name", None)
-    ev_version = getattr(evaluator, "version", None)
-    if not ev_name:
-        raise RuntimeError("Generated evaluator artifact is missing a name.")
-    if ev_version is None:
-        raise RuntimeError("Generated evaluator artifact is missing a version.")
-
-    definition: Any = getattr(evaluator, "definition", None)
-    dimensions_raw: Any = getattr(definition, "dimensions", None) if definition is not None else None
-    dimensions: tuple[RubricDimension, ...] | None = None
-    if dimensions_raw:
-        parsed: list[RubricDimension] = []
-        for entry in dimensions_raw:
-            try:
-                parsed.append(
-                    RubricDimension(
-                        id=str(getattr(entry, "id", "") or ""),
-                        description=str(getattr(entry, "description", "") or ""),
-                        weight=int(getattr(entry, "weight", 0) or 0),
-                        always_applicable=bool(getattr(entry, "always_applicable", False)),
-                    )
-                )
-            except (TypeError, ValueError):
-                logger.debug("Skipping malformed dimension on generated evaluator", exc_info=True)
-        if parsed:
-            dimensions = tuple(parsed)
-
-    pass_threshold: float | None = None
-    if definition is not None:
-        raw_threshold = getattr(definition, "pass_threshold", None)
-        if isinstance(raw_threshold, (int, float)):
-            pass_threshold = float(raw_threshold)
-
-    return GeneratedEvaluatorRef(
-        name=str(ev_name),
-        version=str(ev_version),
-        category=category,
-        display_name=getattr(evaluator, "display_name", None),
-        description=getattr(evaluator, "description", None),
-        dimensions=dimensions,
-        pass_threshold=pass_threshold,
-    )
-
 
 # ---------------------------------------------------------------------------
 # Foundry-specific functions (not part of the Evaluator protocol)
diff --git a/python/packages/foundry/tests/test_evals_config.py b/python/packages/foundry/tests/test_evals_config.py
deleted file mode 100644
index a1c86187d47..00000000000
--- a/python/packages/foundry/tests/test_evals_config.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Copyright (c) Microsoft. All rights reserved.
-
-"""Tests for the YAML-driven evaluator configuration loader."""
-
-from __future__ import annotations
-
-import textwrap
-from pathlib import Path
-from typing import Any
-from unittest.mock import MagicMock
-
-import pytest
-
-from agent_framework_foundry._evals_config import (
-    RubricGenerationSpec,
-    RubricSourceSpec,
-    build_sources,
-    load_evals_config,
-    parse_evals_config,
-)
-from agent_framework_foundry._foundry_evals import EvalGenerationSource
-
-
-def _make_agent(name: str = "agent-a", instructions: str = "Be brief.") -> Any:
-    from agent_framework._evaluation import _render_agent_dossier
-
-    agent = MagicMock()
-    agent.name = name
-    agent.description = f"{name} description"
-    agent.default_options = {"instructions": instructions, "tools": []}
-    agent.context_providers = []
-    agent.mcp_tools = []
-    agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier(
-        agent,
-        include_instructions=kw.get("include_instructions", True),
-        include_tools=kw.get("include_tools", True),
-        include_context_providers=kw.get("include_context_providers", False),
-        include_examples=kw.get("include_examples", False),
-        examples=kw.get("examples"),
-    )
-    return agent
-
-
-def _make_workflow() -> Any:
-    from agent_framework._evaluation import _render_workflow_dossier
-
-    workflow = MagicMock()
-    workflow.name = "wf-1"
-    workflow.description = "demo"
-    workflow.to_dict.return_value = {"name": "wf-1", "id": "wf_1", "executors": {}, "edge_groups": []}
-    workflow.executors = {}
-    workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(
-        workflow,
-        include_instructions=kw.get("include_instructions", True),
-        include_tools=kw.get("include_tools", True),
-        include_context_providers=kw.get("include_context_providers", False),
-        include_examples=kw.get("include_examples", False),
-        examples=kw.get("examples"),
-        include_topology=kw.get("include_topology", True),
-    )
-    return workflow
-
-
-class TestParseEvalsConfig:
-    """Parsing already-loaded dicts into RubricGenerationSpec instances."""
-
-    def test_minimal_spec(self) -> None:
-        config = parse_evals_config({
-            "evaluators": {
-                "my-rubric": {
-                    "type": "foundry.generated_rubric",
-                }
-            }
-        })
-        assert "my-rubric" in config
-        spec = config["my-rubric"]
-        assert spec.name == "my-rubric"
-        assert spec.type == "foundry.generated_rubric"
-        assert spec.category == "quality"
-        assert spec.sources == ()
-
-    def test_full_spec_with_sources(self) -> None:
-        config = parse_evals_config({
-            "evaluators": {
-                "reservation-quality": {
-                    "type": "foundry.generated_rubric",
-                    "category": "quality",
-                    "model": "gpt-4o",
-                    "agent": "reservation-agent",
-                    "display_name": "Reservation Quality",
-                    "description": "Custom rubric for reservation agent.",
-                    "sources": [
-                        {
-                            "type": "agent",
-                            "include_instructions": True,
-                            "include_tools": True,
-                            "include_context_providers": True,
-                        },
-                        {
-                            "type": "dataset",
-                            "name": "reservation-business-rules",
-                            "version": 1,
-                        },
-                    ],
-                }
-            }
-        })
-        spec = config["reservation-quality"]
-        assert spec.model == "gpt-4o"
-        assert spec.agent == "reservation-agent"
-        assert spec.display_name == "Reservation Quality"
-        assert len(spec.sources) == 2
-
-        agent_src = spec.sources[0]
-        assert agent_src.type == "agent"
-        assert agent_src.include_context_providers is True
-
-        dataset_src = spec.sources[1]
-        assert dataset_src.type == "dataset"
-        assert dataset_src.name == "reservation-business-rules"
-        assert dataset_src.version == "1"  # coerced to string
-
-    def test_rejects_non_mapping(self) -> None:
-        with pytest.raises(ValueError, match="must be a mapping"):
-            parse_evals_config([])
-
-    def test_rejects_missing_evaluators_key(self) -> None:
-        with pytest.raises(ValueError, match="evaluators"):
-            parse_evals_config({"other": {}})
-
-    def test_rejects_unknown_type(self) -> None:
-        with pytest.raises(ValueError, match="unsupported type"):
-            parse_evals_config({"evaluators": {"x": {"type": "foundry.other"}}})
-
-    def test_rejects_invalid_category(self) -> None:
-        with pytest.raises(ValueError, match="invalid category"):
-            parse_evals_config({"evaluators": {"x": {"type": "foundry.generated_rubric", "category": "bogus"}}})
-
-    def test_rejects_invalid_source_type(self) -> None:
-        with pytest.raises(ValueError, match="invalid type"):
-            parse_evals_config({
-                "evaluators": {
-                    "x": {
-                        "type": "foundry.generated_rubric",
-                        "sources": [{"type": "bogus"}],
-                    }
-                }
-            })
-
-
-class TestLoadEvalsConfig:
-    """End-to-end YAML loading."""
-
-    def test_load_from_yaml_file(self, tmp_path: Path) -> None:
-        pytest.importorskip("yaml")
-        config_path = tmp_path / "evals.yaml"
-        config_path.write_text(
-            textwrap.dedent(
-                """\
-                evaluators:
-                  my-eval:
-                    type: foundry.generated_rubric
-                    category: safety
-                    model: gpt-4o-mini
-                    sources:
-                      - type: prompt
-                        prompt: "Score the response."
-                """
-            ),
-            encoding="utf-8",
-        )
-        config = load_evals_config(config_path)
-        assert "my-eval" in config
-        spec = config["my-eval"]
-        assert spec.category == "safety"
-        assert spec.model == "gpt-4o-mini"
-        assert len(spec.sources) == 1
-        assert spec.sources[0].type == "prompt"
-        assert spec.sources[0].prompt == "Score the response."
-
-
-class TestBuildSources:
-    """Translate RubricGenerationSpec sources into EvalGenerationSource instances."""
-
-    def test_no_sources_with_agent_default(self) -> None:
-        spec = RubricGenerationSpec(name="x")
-        agent = _make_agent()
-        sources = build_sources(spec, agent=agent)
-        assert len(sources) == 1
-        assert sources[0].type == "prompt"
-        assert sources[0].prompt is not None
-        assert "Agent name: agent-a" in sources[0].prompt
-
-    def test_no_sources_with_workflow_default(self) -> None:
-        spec = RubricGenerationSpec(name="x")
-        workflow = _make_workflow()
-        sources = build_sources(spec, workflow=workflow)
-        assert len(sources) == 1
-        assert sources[0].type == "prompt"
-        assert sources[0].prompt is not None
-        assert "Workflow name: wf-1" in sources[0].prompt
-
-    def test_no_sources_no_agent_or_workflow_raises(self) -> None:
-        spec = RubricGenerationSpec(name="x")
-        with pytest.raises(ValueError, match="no sources"):
-            build_sources(spec)
-
-    def test_agent_source_uses_supplied_agent(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="agent", include_context_providers=True),),
-        )
-        agent = _make_agent()
-        sources = build_sources(spec, agent=agent)
-        assert sources[0].type == "prompt"
-        assert sources[0].prompt is not None
-        assert "Agent name: agent-a" in sources[0].prompt
-
-    def test_agent_source_with_agent_name_uses_hosted_path(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="agent", agent_name="hosted-foundry-agent"),),
-        )
-        sources = build_sources(spec)
-        assert sources[0].type == "agent"
-        assert sources[0].agent_name == "hosted-foundry-agent"
-
-    def test_agent_source_without_agent_raises(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="agent"),),
-        )
-        with pytest.raises(ValueError, match="no agent="):
-            build_sources(spec)
-
-    def test_workflow_source_uses_supplied_workflow(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="workflow", include_topology=False),),
-        )
-        workflow = _make_workflow()
-        sources = build_sources(spec, workflow=workflow)
-        assert sources[0].type == "prompt"
-        assert sources[0].prompt is not None
-        assert "Workflow name: wf-1" in sources[0].prompt
-        assert "Topology (JSON):" not in sources[0].prompt
-
-    def test_prompt_source_translates_directly(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="prompt", prompt="Score it."),),
-        )
-        sources = build_sources(spec)
-        assert sources[0] == EvalGenerationSource(type="prompt", prompt="Score it.")
-
-    def test_dataset_source_translates(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="dataset", name="ds", version="2"),),
-        )
-        sources = build_sources(spec)
-        assert sources[0].type == "dataset"
-        assert sources[0].dataset_name == "ds"
-        assert sources[0].dataset_version == "2"
-
-    def test_traces_source_passes_metadata(self) -> None:
-        spec = RubricGenerationSpec(
-            name="x",
-            sources=(RubricSourceSpec(type="traces", metadata={"environment": "prod"}),),
-        )
-        sources = build_sources(spec)
-        assert sources[0].type == "traces"
-        assert sources[0].metadata == {"environment": "prod"}
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index d24c528a744..df8627352bb 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -6,7 +6,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import Any, cast
+from typing import Any
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -27,7 +27,6 @@
 
 from agent_framework_foundry._foundry_evals import (
     FoundryEvals,
-    RubricDimension,
     _build_item_schema,
     _build_testing_criteria,
     _extract_per_evaluator,
@@ -65,32 +64,6 @@ def _make_tool(name: str) -> MagicMock:
     return t
 
 
-def _make_stub_agent(
-    *,
-    name: str = "alpha",
-    description: str = "An agent.",
-    instructions: str = "Be brief.",
-) -> MagicMock:
-    """Mock agent whose as_eval_source returns a real dossier string."""
-    from agent_framework._evaluation import _render_agent_dossier
-
-    agent = MagicMock()
-    agent.name = name
-    agent.description = description
-    agent.default_options = {"instructions": instructions, "tools": []}
-    agent.context_providers = []
-    agent.mcp_tools = []
-    agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier(
-        agent,
-        include_instructions=kw.get("include_instructions", True),
-        include_tools=kw.get("include_tools", True),
-        include_context_providers=kw.get("include_context_providers", False),
-        include_examples=kw.get("include_examples", False),
-        examples=kw.get("examples"),
-    )
-    return agent
-
-
 @dataclass
 class _MockResultCounts:
     """Mock matching the OpenAI SDK ResultCounts Pydantic model shape."""
@@ -3045,729 +3018,3 @@ async def test_target_without_type_raises(self) -> None:
                 client=mock_client,
                 model="gpt-4o",
             )
-
-
-class TestFoundryAgentAsEvalSource:
-    """Tests for foundry's agent_as_eval_source helper (wraps BaseAgent.as_eval_source)."""
-
-    def test_returns_prompt_source_with_dossier(self) -> None:
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
-        source = agent_as_eval_source(agent)
-        assert source.type == "prompt"
-        assert source.description == "Looks up the weather."
-        assert source.prompt is not None
-        assert "Agent name: weather-bot" in source.prompt
-        assert "Be brief." in source.prompt
-
-    def test_hosted_agent_name_emits_agent_source(self) -> None:
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
-        source = agent_as_eval_source(agent, hosted_agent_name="weather-bot-hosted-id")
-        assert source.type == "agent"
-        assert source.agent_name == "weather-bot-hosted-id"
-        assert source.prompt is None
-        assert source.description == "Looks up the weather."
-
-    def test_explicit_hosted_agent_version_forwarded(self) -> None:
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="weather-bot")
-        source = agent_as_eval_source(
-            agent,
-            hosted_agent_name="weather-bot-hosted-id",
-            hosted_agent_version="3",
-        )
-        assert source.type == "agent"
-        assert source.agent_name == "weather-bot-hosted-id"
-        assert source.agent_version == "3"
-
-    def test_auto_detects_hosted_foundry_agent(self) -> None:
-        """A chat_client carrying agent_name/agent_version is treated as a hosted agent."""
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
-        agent.chat_client = MagicMock()
-        agent.chat_client.agent_name = "weather-prompt-agent"
-        agent.chat_client.agent_version = "2"
-
-        source = agent_as_eval_source(agent)
-        assert source.type == "agent"
-        assert source.agent_name == "weather-prompt-agent"
-        assert source.agent_version == "2"
-        assert source.prompt is None
-        assert source.description == "Looks up the weather."
-
-    def test_auto_detection_handles_versionless_hosted_agent(self) -> None:
-        """HostedAgents typically omit agent_version (no None forwarded)."""
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="weather-bot")
-        agent.chat_client = MagicMock()
-        agent.chat_client.agent_name = "weather-hosted-agent"
-        agent.chat_client.agent_version = None
-
-        source = agent_as_eval_source(agent)
-        assert source.type == "agent"
-        assert source.agent_name == "weather-hosted-agent"
-        assert source.agent_version is None
-
-    def test_force_prompt_source_overrides_auto_detection(self) -> None:
-        """force_prompt_source=True falls back to dossier even for hosted agents."""
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
-        agent.chat_client = MagicMock()
-        agent.chat_client.agent_name = "weather-prompt-agent"
-        agent.chat_client.agent_version = "2"
-
-        source = agent_as_eval_source(agent, force_prompt_source=True)
-        assert source.type == "prompt"
-        assert source.prompt is not None
-        assert "Agent name: weather-bot" in source.prompt
-
-    def test_auto_detection_ignores_non_string_chat_client_fields(self) -> None:
-        """Bare MagicMock chat_client (untyped attrs) must not trigger detection."""
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent(name="local-agent")
-        agent.chat_client = MagicMock()  # agent_name attr resolves to a MagicMock, not a str
-
-        source = agent_as_eval_source(agent)
-        assert source.type == "prompt"
-        assert source.prompt is not None
-        assert "Agent name: local-agent" in source.prompt
-
-    def test_forwards_keyword_options_to_agent(self) -> None:
-        from agent_framework_foundry._foundry_evals import agent_as_eval_source
-
-        agent = _make_stub_agent()
-        source = agent_as_eval_source(agent, include_instructions=False)
-        assert source.prompt is not None
-        assert "Instructions:" not in source.prompt
-
-
-class TestFoundryWorkflowAsEvalSource:
-    """Tests for foundry's workflow_as_eval_source helper (wraps Workflow.as_eval_source)."""
-
-    def _make_workflow(self) -> MagicMock:
-        from agent_framework._evaluation import _render_workflow_dossier
-
-        workflow = MagicMock()
-        workflow.name = "demo-workflow"
-        workflow.description = "Routes user questions."
-        workflow.to_dict.return_value = {
-            "name": "demo-workflow",
-            "id": "wf_1",
-            "executors": {},
-            "edge_groups": [],
-        }
-        workflow.executors = {}
-        workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(
-            workflow,
-            include_instructions=kw.get("include_instructions", True),
-            include_tools=kw.get("include_tools", True),
-            include_context_providers=kw.get("include_context_providers", False),
-            include_examples=kw.get("include_examples", False),
-            examples=kw.get("examples"),
-            include_topology=kw.get("include_topology", True),
-        )
-        return workflow
-
-    def test_returns_prompt_source_with_topology(self) -> None:
-        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
-
-        workflow = self._make_workflow()
-        source = workflow_as_eval_source(workflow)
-        assert source.type == "prompt"
-        assert source.description == "Routes user questions."
-        assert source.prompt is not None
-        assert "Workflow name: demo-workflow" in source.prompt
-        assert "Topology (JSON):" in source.prompt
-
-    def test_topology_can_be_disabled(self) -> None:
-        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
-
-        workflow = self._make_workflow()
-        source = workflow_as_eval_source(workflow, include_topology=False)
-        assert source.prompt is not None
-        assert "Topology (JSON):" not in source.prompt
-
-
-class TestCoalesceGenerationSources:
-    """Validation for the source-resolution helper used by FoundryEvals.generate_rubric."""
-
-    def test_requires_exactly_one_source(self) -> None:
-        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
-
-        with pytest.raises(ValueError, match="Provide one of"):
-            _coalesce_generation_sources(agent=None, workflow=None, sources=None)
-
-    def test_rejects_multiple_sources(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _coalesce_generation_sources
-
-        agent = MagicMock()
-        agent.name = "a"
-        agent.description = None
-        agent.default_options = {"instructions": "x", "tools": []}
-        agent.context_providers = []
-        agent.mcp_tools = []
-        with pytest.raises(ValueError, match="only one of"):
-            _coalesce_generation_sources(
-                agent=agent,
-                workflow=None,
-                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
-            )
-
-    def test_uses_agent_helper_when_only_agent_supplied(self) -> None:
-        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
-
-        agent = _make_stub_agent(name="alpha", description="An agent.")
-
-        sources = _coalesce_generation_sources(agent=agent, workflow=None, sources=None)
-        assert len(sources) == 1
-        assert sources[0].type == "prompt"
-        assert sources[0].prompt is not None
-        assert "Agent name: alpha" in sources[0].prompt
-
-    def test_rejects_empty_sources_list(self) -> None:
-        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
-
-        with pytest.raises(ValueError, match="at least one"):
-            _coalesce_generation_sources(agent=None, workflow=None, sources=[])
-
-
-class TestToSdkSource:
-    """Translation between EvalGenerationSource and SDK *JobSource types."""
-
-    def _make_sdk_types(self, *, with_agent: bool = True, with_dataset: bool = True, with_traces: bool = True) -> Any:
-        from agent_framework_foundry._foundry_evals import _GenerationSdkTypes
-
-        return _GenerationSdkTypes(
-            EvaluatorGenerationInputs=MagicMock(),
-            EvaluatorGenerationJob=MagicMock(),
-            PromptSource=MagicMock(name="PromptSource"),
-            AgentSource=MagicMock(name="AgentSource") if with_agent else None,
-            DatasetSource=MagicMock(name="DatasetSource") if with_dataset else None,
-            TracesSource=MagicMock(name="TracesSource") if with_traces else None,
-        )
-
-    def test_prompt_source_is_translated(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types()
-        sdk.PromptSource.return_value = "prompt-sdk-instance"
-        out = _to_sdk_source(
-            EvalGenerationSource(type="prompt", prompt="hello", description="d"),
-            sdk,
-        )
-        assert out == "prompt-sdk-instance"
-        sdk.PromptSource.assert_called_once_with(prompt="hello", description="d")
-
-    def test_prompt_without_text_raises(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types()
-        with pytest.raises(ValueError, match="non-empty prompt"):
-            _to_sdk_source(EvalGenerationSource(type="prompt"), sdk)
-
-    def test_agent_source_is_translated(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types()
-        sdk.AgentSource.return_value = "agent-sdk-instance"
-        out = _to_sdk_source(
-            EvalGenerationSource(type="agent", agent_name="my-hosted-agent"),
-            sdk,
-        )
-        assert out == "agent-sdk-instance"
-        sdk.AgentSource.assert_called_once_with(agent_name="my-hosted-agent")
-
-    def test_agent_source_requires_name(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types()
-        with pytest.raises(ValueError, match="agent_name"):
-            _to_sdk_source(EvalGenerationSource(type="agent"), sdk)
-
-    def test_agent_source_raises_when_sdk_missing(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types(with_agent=False)
-        with pytest.raises(NotImplementedError, match="AgentEvaluatorGenerationJobSource"):
-            _to_sdk_source(
-                EvalGenerationSource(type="agent", agent_name="x"),
-                sdk,
-            )
-
-    def test_dataset_source_is_translated(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types()
-        sdk.DatasetSource.return_value = "dataset-sdk-instance"
-        out = _to_sdk_source(
-            EvalGenerationSource(type="dataset", dataset_name="ds", dataset_version="1"),
-            sdk,
-        )
-        assert out == "dataset-sdk-instance"
-        sdk.DatasetSource.assert_called_once_with(name="ds", version="1")
-
-    def test_agent_source_forwards_agent_version(self) -> None:
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
-
-        sdk = self._make_sdk_types()
-        sdk.AgentSource.return_value = "agent-sdk-instance"
-        out = _to_sdk_source(
-            EvalGenerationSource(type="agent", agent_name="prompt-agent", agent_version="2"),
-            sdk,
-        )
-        assert out == "agent-sdk-instance"
-        sdk.AgentSource.assert_called_once_with(agent_name="prompt-agent", agent_version="2")
-
-
-class TestPollGenerationJob:
-    """Behavior of the rubric-generation polling loop."""
-
-    async def test_returns_immediately_on_succeeded(self) -> None:
-        from agent_framework_foundry._foundry_evals import _poll_generation_job
-
-        evaluators_ops = MagicMock()
-        evaluators_ops.get_generation_job = AsyncMock()
-        job = MagicMock(id="job_1", status="succeeded")
-        out = await _poll_generation_job(evaluators_ops, job, poll_interval=0.01, timeout=1.0)
-        assert out is job
-        evaluators_ops.get_generation_job.assert_not_called()
-
-    async def test_polls_until_terminal(self) -> None:
-        from agent_framework_foundry._foundry_evals import _poll_generation_job
-
-        running = MagicMock(id="job_1", status="running")
-        succeeded = MagicMock(id="job_1", status="succeeded")
-        evaluators_ops = MagicMock()
-        evaluators_ops.get_generation_job = AsyncMock(side_effect=[running, succeeded])
-
-        initial = MagicMock(id="job_1", status="running")
-        out = await _poll_generation_job(evaluators_ops, initial, poll_interval=0.001, timeout=1.0)
-        assert out is succeeded
-        assert evaluators_ops.get_generation_job.await_count == 2
-
-    async def test_failed_status_raises(self) -> None:
-        from agent_framework_foundry._foundry_evals import _poll_generation_job
-
-        err = MagicMock(message="boom")
-        terminal = MagicMock(id="job_1", status="failed", error=err)
-        evaluators_ops = MagicMock()
-        evaluators_ops.get_generation_job = AsyncMock(return_value=terminal)
-
-        with pytest.raises(RuntimeError, match="boom"):
-            await _poll_generation_job(
-                evaluators_ops,
-                MagicMock(id="job_1", status="running"),
-                poll_interval=0.001,
-                timeout=1.0,
-            )
-
-    async def test_timeout_raises(self) -> None:
-        from agent_framework_foundry._foundry_evals import _poll_generation_job
-
-        running = MagicMock(id="job_1", status="running")
-        evaluators_ops = MagicMock()
-        evaluators_ops.get_generation_job = AsyncMock(return_value=running)
-
-        with pytest.raises(TimeoutError):
-            await _poll_generation_job(evaluators_ops, running, poll_interval=0.001, timeout=0.005)
-
-
-class TestGenerationJobToRef:
-    """Translation of a completed generation job to a GeneratedEvaluatorRef."""
-
-    def test_builds_pinned_ref_with_dimensions(self) -> None:
-        from agent_framework_foundry._foundry_evals import RubricDimension, _generation_job_to_ref
-
-        dim = MagicMock(id="d1", description="dim", weight=2, always_applicable=True)
-        definition = MagicMock(dimensions=[dim], pass_threshold=0.75)
-        evaluator = MagicMock(
-            name="my-eval",
-            version=3,
-            display_name="My Eval",
-            description="A custom rubric.",
-            definition=definition,
-        )
-        evaluator.name = "my-eval"
-        job = MagicMock(artifacts=MagicMock(evaluator=evaluator))
-
-        ref = _generation_job_to_ref(job, category="quality")
-        assert ref.name == "my-eval"
-        assert ref.version == "3"
-        assert ref.display_name == "My Eval"
-        assert ref.description == "A custom rubric."
-        assert ref.category == "quality"
-        assert ref.pass_threshold == 0.75
-        assert ref.dimensions is not None
-        assert ref.dimensions[0] == RubricDimension(id="d1", description="dim", weight=2, always_applicable=True)
-
-    def test_missing_artifacts_raises(self) -> None:
-        from agent_framework_foundry._foundry_evals import _generation_job_to_ref
-
-        job = MagicMock(artifacts=None)
-        with pytest.raises(RuntimeError, match="evaluator artifact"):
-            _generation_job_to_ref(job, category="quality")
-
-
-class TestGenerateRubricSdkMissing:
-    """generate_rubric raises NotImplementedError when SDK lacks the rubric APIs."""
-
-    async def test_raises_when_sdk_types_unavailable(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource
-
-        def _raise() -> Any:
-            raise fm._RubricSdkUnavailableError(fm._RUBRIC_SDK_MISSING_MSG)
-
-        monkeypatch.setattr(fm, "_import_generation_sdk_types", _raise)
-
-        project_client = MagicMock()
-
-        with pytest.raises(NotImplementedError, match="rubric"):
-            await FoundryEvals.generate_rubric(
-                project_client=project_client,
-                name="my-eval",
-                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
-            )
-
-    async def test_raises_value_error_on_invalid_category(self) -> None:
-        """category outside {quality, safety} should fail fast at the boundary."""
-        from agent_framework_foundry._foundry_evals import EvalGenerationSource
-
-        project_client = MagicMock()
-
-        with pytest.raises(ValueError, match="category"):
-            await FoundryEvals.generate_rubric(
-                project_client=project_client,
-                name="my-eval",
-                sources=[EvalGenerationSource(type="prompt", prompt="hi")],
-                category=cast("Any", "invalid"),
-            )
-
-
-class TestGenerateRubricE2E:
-    """End-to-end happy path for generate_rubric with mocked SDK."""
-
-    async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        # Stub SDK type handles
-        prompt_cls = MagicMock(name="PromptSource")
-        prompt_cls.return_value = "sdk-prompt"
-        inputs_cls = MagicMock(name="EvaluatorGenerationInputs")
-        inputs_cls.return_value = "sdk-inputs"
-        job_cls = MagicMock(name="EvaluatorGenerationJob")
-        job_cls.return_value = "sdk-job"
-
-        sdk_types = fm._GenerationSdkTypes(
-            EvaluatorGenerationInputs=inputs_cls,
-            EvaluatorGenerationJob=job_cls,
-            PromptSource=prompt_cls,
-            AgentSource=None,
-            DatasetSource=None,
-            TracesSource=None,
-        )
-        monkeypatch.setattr(fm, "_import_generation_sdk_types", lambda: sdk_types)
-
-        # Mock the SDK operations and completed job
-        completed_evaluator = MagicMock(version="7", display_name=None, description=None)
-        completed_evaluator.name = "agent-rubric"
-        completed_evaluator.definition = MagicMock(dimensions=[], pass_threshold=None)
-        completed = MagicMock(
-            id="job_42",
-            status="succeeded",
-            artifacts=MagicMock(evaluator=completed_evaluator),
-        )
-
-        evaluators_ops = MagicMock()
-        evaluators_ops.create_generation_job = AsyncMock(return_value=completed)
-        evaluators_ops.get_generation_job = AsyncMock(return_value=completed)
-        project_client = MagicMock()
-        project_client.beta = MagicMock(evaluators=evaluators_ops)
-
-        # Build a stub agent
-        agent = _make_stub_agent(
-            name="weather-bot",
-            description="Looks up weather.",
-            instructions="Be brief.",
-        )
-
-        ref = await FoundryEvals.generate_rubric(
-            project_client=project_client,
-            name="agent-rubric",
-            agent=agent,
-            category="quality",
-            model="gpt-4o",
-            display_name="Display",
-            description="Desc",
-            operation_id="op-123",
-        )
-
-        assert ref.name == "agent-rubric"
-        assert ref.version == "7"
-        assert ref.category == "quality"
-
-        # Verify inputs/job/source assembly
-        prompt_cls.assert_called_once()
-        prompt_kwargs = prompt_cls.call_args.kwargs
-        assert "Agent name: weather-bot" in prompt_kwargs["prompt"]
-        assert prompt_kwargs["description"] == "Looks up weather."
-
-        inputs_cls.assert_called_once()
-        inputs_kwargs = inputs_cls.call_args.kwargs
-        assert inputs_kwargs["name"] == "agent-rubric"
-        assert inputs_kwargs["category"] == "quality"
-        assert inputs_kwargs["model"] == "gpt-4o"
-        assert inputs_kwargs["display_name"] == "Display"
-        assert inputs_kwargs["description"] == "Desc"
-        assert inputs_kwargs["sources"] == ["sdk-prompt"]
-
-        job_cls.assert_called_once_with(inputs="sdk-inputs")
-        evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123")
-
-
-# ---------------------------------------------------------------------------
-# FoundryEvals.create_rubric_evaluator — manual rubric registration
-# ---------------------------------------------------------------------------
-
-
-class TestCreateRubricEvaluatorValidation:
-    """Argument validation for ``FoundryEvals.create_rubric_evaluator``."""
-
-    async def test_rejects_empty_dimensions(self) -> None:
-        with pytest.raises(ValueError, match="at least one dimension"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[],
-            )
-
-    async def test_rejects_invalid_category(self) -> None:
-        with pytest.raises(ValueError, match="category"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[RubricDimension(id="a", description="d", weight=5)],
-                category="bogus",  # type: ignore[arg-type]
-            )
-
-    async def test_rejects_out_of_range_pass_threshold(self) -> None:
-        with pytest.raises(ValueError, match="pass_threshold"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[RubricDimension(id="a", description="d", weight=5)],
-                pass_threshold=1.5,
-            )
-
-    async def test_rejects_duplicate_dimension_ids(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        sdk = fm._ManualRubricSdkTypes(
-            EvaluatorVersion=MagicMock(),
-            RubricBasedEvaluatorDefinition=MagicMock(),
-            Dimension=MagicMock(),
-        )
-        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
-        with pytest.raises(ValueError, match="Duplicate"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[
-                    RubricDimension(id="dup", description="d1", weight=5),
-                    RubricDimension(id="dup", description="d2", weight=3),
-                ],
-            )
-
-    async def test_rejects_weight_out_of_range(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        sdk = fm._ManualRubricSdkTypes(
-            EvaluatorVersion=MagicMock(),
-            RubricBasedEvaluatorDefinition=MagicMock(),
-            Dimension=MagicMock(),
-        )
-        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
-        with pytest.raises(ValueError, match="weight"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[RubricDimension(id="a", description="d", weight=0)],
-            )
-
-    async def test_rejects_empty_description(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        sdk = fm._ManualRubricSdkTypes(
-            EvaluatorVersion=MagicMock(),
-            RubricBasedEvaluatorDefinition=MagicMock(),
-            Dimension=MagicMock(),
-        )
-        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
-        with pytest.raises(ValueError, match="description"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[RubricDimension(id="a", description="", weight=5)],
-            )
-
-
-class TestCreateRubricEvaluatorSdkMissing:
-    async def test_raises_not_implemented_when_sdk_lacks_types(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        def _raise() -> Any:
-            raise fm._RubricSdkUnavailableError("nope")
-
-        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", _raise)
-        with pytest.raises(NotImplementedError, match="nope"):
-            await FoundryEvals.create_rubric_evaluator(
-                project_client=MagicMock(),
-                name="x",
-                dimensions=[RubricDimension(id="a", description="d", weight=5)],
-            )
-
-
-class TestCreateRubricEvaluatorE2E:
-    """End-to-end happy path for create_rubric_evaluator with mocked SDK."""
-
-    async def test_calls_create_version_with_rubric_definition(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        dimension_cls = MagicMock(name="Dimension", side_effect=lambda **kw: ("dim", kw))
-        definition_cls = MagicMock(name="RubricBasedEvaluatorDefinition", side_effect=lambda **kw: ("def", kw))
-        version_cls = MagicMock(name="EvaluatorVersion", side_effect=lambda **kw: ("ver", kw))
-
-        sdk = fm._ManualRubricSdkTypes(
-            EvaluatorVersion=version_cls,
-            RubricBasedEvaluatorDefinition=definition_cls,
-            Dimension=dimension_cls,
-        )
-        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
-
-        created_definition = MagicMock()
-        created_definition.dimensions = [
-            MagicMock(dimension_id="intent", description="d1", weight=9, always_applicable=False),
-            MagicMock(dimension_id="general_quality", description="g", weight=5, always_applicable=True),
-        ]
-        created_definition.pass_threshold = 0.7
-        created_version = MagicMock(
-            display_name="DN",
-            description="hand-authored",
-        )
-        created_version.name = "policy-eval"
-        created_version.version = "3"
-        created_version.definition = created_definition
-
-        evaluators_ops = MagicMock()
-        evaluators_ops.create_version = AsyncMock(return_value=created_version)
-        project_client = MagicMock()
-        project_client.beta = MagicMock(evaluators=evaluators_ops)
-
-        ref = await FoundryEvals.create_rubric_evaluator(
-            project_client=project_client,
-            name="policy-eval",
-            dimensions=[
-                RubricDimension(id="intent", description="d1", weight=9),
-                RubricDimension(id="general_quality", description="g", weight=5, always_applicable=True),
-            ],
-            category="quality",
-            pass_threshold=0.7,
-            display_name="DN",
-            description="hand-authored",
-            tags={"team": "agents"},
-            metadata={"source": "manual"},
-        )
-
-        # Returned ref carries the persisted (name, version) and snapshot of dimensions.
-        assert ref.name == "policy-eval"
-        assert ref.version == "3"
-        assert ref.category == "quality"
-        assert ref.pass_threshold == 0.7
-        assert ref.dimensions is not None
-        assert [d.id for d in ref.dimensions] == ["intent", "general_quality"]
-        assert ref.dimensions[1].always_applicable is True
-
-        # Dimension construction used dimension_id, included always_applicable only when True.
-        assert dimension_cls.call_count == 2
-        first_kwargs = dimension_cls.call_args_list[0].kwargs
-        assert first_kwargs == {"dimension_id": "intent", "description": "d1", "weight": 9}
-        second_kwargs = dimension_cls.call_args_list[1].kwargs
-        assert second_kwargs == {
-            "dimension_id": "general_quality",
-            "description": "g",
-            "weight": 5,
-            "always_applicable": True,
-        }
-
-        # Definition construction forwarded pass_threshold and the two sdk dimensions.
-        definition_cls.assert_called_once()
-        def_kwargs = definition_cls.call_args.kwargs
-        assert def_kwargs["pass_threshold"] == 0.7
-        assert def_kwargs["dimensions"] == [
-            ("dim", {"dimension_id": "intent", "description": "d1", "weight": 9}),
-            (
-                "dim",
-                {
-                    "dimension_id": "general_quality",
-                    "description": "g",
-                    "weight": 5,
-                    "always_applicable": True,
-                },
-            ),
-        ]
-
-        # EvaluatorVersion construction passed evaluator_type="custom", category list, and optionals.
-        version_cls.assert_called_once()
-        ver_kwargs = version_cls.call_args.kwargs
-        assert ver_kwargs["evaluator_type"] == "custom"
-        assert ver_kwargs["categories"] == ["quality"]
-        assert ver_kwargs["display_name"] == "DN"
-        assert ver_kwargs["description"] == "hand-authored"
-        assert ver_kwargs["tags"] == {"team": "agents"}
-        assert ver_kwargs["metadata"] == {"source": "manual"}
-
-        # SDK ops invoked with name + evaluator_version kwarg.
-        evaluators_ops.create_version.assert_awaited_once()
-        call = evaluators_ops.create_version.await_args
-        assert call.args == ("policy-eval",)
-        assert "evaluator_version" in call.kwargs
-
-    async def test_omits_pass_threshold_when_not_set(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from agent_framework_foundry import _foundry_evals as fm
-
-        dimension_cls = MagicMock(side_effect=lambda **kw: kw)
-        definition_cls = MagicMock(side_effect=lambda **kw: kw)
-        version_cls = MagicMock(side_effect=lambda **kw: kw)
-
-        sdk = fm._ManualRubricSdkTypes(
-            EvaluatorVersion=version_cls,
-            RubricBasedEvaluatorDefinition=definition_cls,
-            Dimension=dimension_cls,
-        )
-        monkeypatch.setattr(fm, "_import_manual_rubric_sdk_types", lambda: sdk)
-
-        created = MagicMock(display_name=None, description=None)
-        created.name = "x"
-        created.version = "1"
-        created.definition = MagicMock(dimensions=[], pass_threshold=None)
-
-        evaluators_ops = MagicMock()
-        evaluators_ops.create_version = AsyncMock(return_value=created)
-        project_client = MagicMock()
-        project_client.beta = MagicMock(evaluators=evaluators_ops)
-
-        ref = await FoundryEvals.create_rubric_evaluator(
-            project_client=project_client,
-            name="x",
-            dimensions=[RubricDimension(id="a", description="d", weight=5)],
-        )
-        assert ref.pass_threshold is None
-        assert "pass_threshold" not in definition_cls.call_args.kwargs
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
index b7f8f7cc1b6..4ef22f6ee66 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
@@ -35,25 +35,31 @@ Evaluate what already happened — zero changes to agent code:
 uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py
 ```
 
-### `evaluate_with_generated_rubric_sample.py` — Auto-Generate a Rubric
-
-Let Foundry draft the rubric dimensions for you from the agent's
-context (instructions, tools, description).  Best when you don't yet
-have a fixed scoring rubric and want a strong baseline you can refine.
-
-```bash
-uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
+### Referencing a rubric evaluator created in Foundry
+
+Foundry users can create rubric evaluators in the Foundry portal (or
+through the dedicated SDK / REST surface) — see
+[Rubric evaluators](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators)
+for the authoring flow. Once an evaluator exists, agent-framework
+consumes it like any other evaluator: pass a
+`GeneratedEvaluatorRef(name=..., version=...)` in the `evaluators=`
+list and pin the version for reproducible runs.
+
+```python
+from agent_framework.foundry import FoundryEvals, GeneratedEvaluatorRef
+
+evals = FoundryEvals(
+    evaluators=[
+        GeneratedEvaluatorRef(name="reservation-policy-rubric", version="3"),
+        "relevance",
+        "coherence",
+    ],
+)
 ```
 
-### `evaluate_with_manual_rubric_sample.py` — Author a Rubric Yourself
-
-Bring your own `RubricDimension`s (from a spec, a competing framework,
-or hand tuning) and register them as a versioned evaluator.  Use this
-when you already know what you want to score.
-
-```bash
-uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
-```
+Quality gates on rubric output use the standard `EvalResults` helpers,
+including `assert_dimension_score_at_least(...)` for per-dimension
+thresholds.
 
 ## Setup
 
@@ -64,5 +70,4 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo
 - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1
 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py`
 - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2
-- **"I want Foundry to draft a custom rubric for my agent"** → `evaluate_with_generated_rubric_sample.py`
-- **"I already have a rubric I want to bring into Foundry"** → `evaluate_with_manual_rubric_sample.py`
+- **"I want to score against a custom rubric I created in Foundry"** → pass a `GeneratedEvaluatorRef` (see snippet above)
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
deleted file mode 100644
index 9c19ff552ba..00000000000
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) Microsoft. All rights reserved.
-
-"""Generate a Foundry rubric evaluator from an agent and use it in CI.
-
-This sample demonstrates the end-to-end adaptive-evals flow:
-
-1. Build an agent.
-2. Generate a rubric evaluator from the agent using
-   ``FoundryEvals.generate_rubric()`` — produces a pinned
-   ``GeneratedEvaluatorRef`` you can store in source control.
-3. Use the pinned reference in ``evaluators=[...]`` for a regression
-   run alongside built-in evaluators.
-4. Assert quality gates with ``assert_score_at_least`` /
-   ``assert_dimension_score_at_least`` / ``assert_no_failed_items``.
-
-A companion ``evaluators.yaml`` shows the source-controlled config
-pattern for CI.  Load it with :func:`load_evals_config` and pass the
-resulting spec through :func:`build_sources` to keep generation
-parameters out of code.
-
-Prerequisites:
-- An Azure AI Foundry project with a deployed model.
-- ``azure-ai-projects`` build that includes the rubric-generation APIs.
-- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``.
-
-Run with:
-
-.. code-block:: bash
-
-    az login
-    python evaluate_with_generated_rubric_sample.py
-"""
-
-import asyncio
-import os
-import textwrap
-from pathlib import Path
-
-from agent_framework import evaluate_agent
-from agent_framework.foundry import (
-    FoundryChatClient,
-    FoundryEvals,
-    build_sources,
-    load_evals_config,
-)
-from azure.ai.projects.aio import AIProjectClient
-from azure.identity.aio import AzureCliCredential
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def get_weather(location: str) -> str:
-    """Get the current weather for a location."""
-    samples = {
-        "seattle": "62F, cloudy with a chance of rain",
-        "london": "55F, overcast",
-        "paris": "68F, partly sunny",
-    }
-    return samples.get(location.lower(), f"Weather data not available for {location}")
-
-
-SAMPLE_YAML = textwrap.dedent(
-    """\
-    evaluators:
-      travel-quality:
-        type: foundry.generated_rubric
-        category: quality
-        model: gpt-4o
-        display_name: Travel Quality Rubric
-        description: Custom rubric tailored to the travel-assistant agent.
-        sources:
-          - type: agent
-            include_instructions: true
-            include_tools: true
-    """
-)
-
-
-async def main() -> None:
-    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
-    model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o")
-
-    credential = AzureCliCredential()
-    chat_client = FoundryChatClient(
-        project_endpoint=project_endpoint,
-        model=model_name,
-        credential=credential,
-    )
-    project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)
-
-    agent = chat_client.as_agent(
-        name="travel-assistant",
-        instructions=(
-            "You are a helpful travel assistant.  Always ground recommendations in tool output, "
-            "cite each tool result, and refuse questions outside travel planning."
-        ),
-        tools=[get_weather],
-    )
-
-    # 1. Load the source-controlled evaluator config.
-    config_path = Path(__file__).with_name("evaluators.yaml")
-    if not config_path.exists():
-        config_path.write_text(SAMPLE_YAML, encoding="utf-8")
-        print(f"Wrote sample config to {config_path}")
-    config = load_evals_config(config_path)
-    spec = config["travel-quality"]
-
-    # 2. Generate (or refresh) the rubric evaluator.  In CI you typically run
-    # this once and commit the returned name/version pair.
-    print("Generating rubric evaluator from agent + spec...")
-    sources = build_sources(spec, agent=agent)
-    rubric_ref = await FoundryEvals.generate_rubric(
-        project_client=project_client,
-        name=spec.name,
-        sources=sources,
-        category=spec.category,
-        model=spec.model,
-        display_name=spec.display_name,
-        description=spec.description,
-    )
-    print(f"Generated rubric {rubric_ref.name}@{rubric_ref.version} with {len(rubric_ref.dimensions or ())} dimensions")
-
-    # 3. Run an evaluation that combines built-ins with the new rubric.
-    evals = FoundryEvals(
-        client=chat_client,
-        evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref],
-    )
-    results = await evaluate_agent(
-        agent=agent,
-        queries=[
-            "What's the weather in Seattle?",
-            "Should I pack an umbrella for London?",
-        ],
-        evaluators=evals,
-    )
-
-    # 4. Quality gates — wire these into your CI job's exit status.
-    for r in results:
-        print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}")
-        r.assert_no_failed_items()
-        r.assert_score_at_least(0.8)
-        if rubric_ref.dimensions:
-            r.assert_dimension_score_at_least(rubric_ref.dimensions[0].id, 3)
-
-    await project_client.close()
-    await credential.close()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
deleted file mode 100644
index e1fc86ef71c..00000000000
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_manual_rubric_sample.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) Microsoft. All rights reserved.
-
-"""Register a hand-authored rubric evaluator and use it in CI.
-
-This sample demonstrates the *manual* counterpart to
-``evaluate_with_generated_rubric_sample.py``:
-
-1. Build an agent.
-2. Author the rubric dimensions yourself — useful when you have an
-   established scoring rubric (from a spec, a competing framework, or
-   prior hand tuning) that you want to bring into Foundry as-is.
-3. Register the rubric with
-   :meth:`FoundryEvals.create_rubric_evaluator` — this maps directly to
-   ``project_client.beta.evaluators.create_version`` and returns a
-   pinned ``GeneratedEvaluatorRef`` you can store in source control.
-4. Use the pinned reference in ``evaluators=[...]`` for a regression run
-   alongside built-in evaluators.
-
-The service auto-attaches a non-editable residual dimension
-(``general_quality`` for ``category="quality"``,
-``general_policy_compliance`` for ``"safety"``) — do not include it in
-``dimensions``.
-
-Prefer :meth:`FoundryEvals.generate_rubric` if you want Foundry to
-draft the dimensions for you from the agent's context.  Use this manual
-flow when you already know what you want to score.
-
-Prerequisites:
-- An Azure AI Foundry project with a deployed model.
-- ``azure-ai-projects`` build that includes the rubric APIs (currently
-  ``2.3.0a*`` on the Azure SDK Python dev feed).
-- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``.
-
-Run with:
-
-.. code-block:: bash
-
-    az login
-    python evaluate_with_manual_rubric_sample.py
-"""
-
-import asyncio
-import os
-
-from agent_framework import evaluate_agent
-from agent_framework.foundry import (
-    FoundryChatClient,
-    FoundryEvals,
-    RubricDimension,
-)
-from azure.ai.projects.aio import AIProjectClient
-from azure.identity.aio import AzureCliCredential
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def get_weather(location: str) -> str:
-    """Get the current weather for a location."""
-    samples = {
-        "seattle": "62F, cloudy with a chance of rain",
-        "london": "55F, overcast",
-        "paris": "68F, partly sunny",
-    }
-    return samples.get(location.lower(), f"Weather data not available for {location}")
-
-
-# Hand-authored rubric — this is the artifact you commit alongside the
-# agent so the rubric and the behavior it scores evolve together.
-# Weights are 1-10 (the generation pipeline biases one dimension to
-# 8-10; manual edits aren't constrained by this heuristic).
-TRAVEL_RUBRIC_DIMENSIONS: list[RubricDimension] = [
-    RubricDimension(
-        id="tool_grounding",
-        description=(
-            "Grounds every weather claim in tool output.  Does not invent values when "
-            "the tool returns no data, and does not paraphrase tool output in a way "
-            "that distorts the underlying values."
-        ),
-        weight=9,
-    ),
-    RubricDimension(
-        id="scope_adherence",
-        description=(
-            "Stays within travel-planning scope.  Politely declines or redirects "
-            "questions about topics unrelated to travel (e.g. general trivia, "
-            "personal advice, coding questions)."
-        ),
-        weight=6,
-    ),
-    RubricDimension(
-        id="actionable_recommendation",
-        description=(
-            "Provides a clear, actionable recommendation grounded in the tool result "
-            "(e.g. 'Pack an umbrella' when rain is reported), not just a restatement "
-            "of the raw weather data."
-        ),
-        weight=4,
-    ),
-]
-
-
-async def main() -> None:
-    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
-    model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o")
-
-    credential = AzureCliCredential()
-    chat_client = FoundryChatClient(
-        project_endpoint=project_endpoint,
-        model=model_name,
-        credential=credential,
-    )
-    project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)
-
-    agent = chat_client.as_agent(
-        name="travel-assistant",
-        instructions=(
-            "You are a helpful travel assistant.  Always ground recommendations in "
-            "tool output, cite each tool result, and refuse questions outside travel "
-            "planning."
-        ),
-        tools=[get_weather],
-    )
-
-    # 1. Register (or bump the version of) the hand-authored rubric.
-    # The service auto-attaches the non-editable `general_quality`
-    # residual dimension for quality rubrics.
-    print("Registering manual rubric evaluator...")
-    rubric_ref = await FoundryEvals.create_rubric_evaluator(
-        project_client=project_client,
-        name="travel-quality-manual",
-        dimensions=TRAVEL_RUBRIC_DIMENSIONS,
-        category="quality",
-        pass_threshold=0.6,
-        display_name="Travel Quality (Manual)",
-        description="Hand-authored rubric for the travel-assistant agent.",
-    )
-    print(
-        f"Registered rubric {rubric_ref.name}@{rubric_ref.version} "
-        f"with {len(rubric_ref.dimensions or ())} dimensions "
-        f"(pass_threshold={rubric_ref.pass_threshold})"
-    )
-
-    # 2. Run an evaluation that combines built-ins with the new rubric.
-    evals = FoundryEvals(
-        client=chat_client,
-        evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref],
-    )
-    results = await evaluate_agent(
-        agent=agent,
-        queries=[
-            "What's the weather in Seattle?",
-            "Should I pack an umbrella for London?",
-            "What's the capital of France?",  # off-scope — exercises scope_adherence
-        ],
-        evaluators=evals,
-    )
-
-    # 3. Quality gates — wire these into your CI job's exit status.
-    for r in results:
-        print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}")
-        r.assert_no_failed_items()
-        r.assert_score_at_least(0.7)
-        r.assert_dimension_score_at_least("tool_grounding", 3)
-        r.assert_dimension_score_at_least("scope_adherence", 3)
-
-    await project_client.close()
-    await credential.close()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
deleted file mode 100644
index f3e698c77ce..00000000000
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-evaluators:
-  travel-quality:
-    type: foundry.generated_rubric
-    category: quality
-    model: gpt-4o
-    display_name: Travel Quality Rubric
-    description: Custom rubric tailored to the travel-assistant agent.
-    sources:
-      - type: agent
-        include_instructions: true
-        include_tools: true

From 907c9092b91a8465f7248bc955714e52e2673b0c Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Thu, 28 May 2026 10:16:32 -0700
Subject: [PATCH 13/16] samples(foundry-evals): add evaluate_with_rubric_sample

Adds a runnable end-to-end sample showing how to consume a pre-existing rubric evaluator created in Foundry: reference it with GeneratedEvaluatorRef(name, version), mix it with built-in evaluators in FoundryEvals, and gate CI with assert_dimension_score_at_least on a specific dimension.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../evaluation/foundry_evals/.env.example     |   9 ++
 .../evaluation/foundry_evals/README.md        |   6 +-
 .../evaluate_with_rubric_sample.py            | 138 ++++++++++++++++++
 3 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py

diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example
index b6a8af233e8..388350edea2 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example
@@ -1,3 +1,12 @@
 FOUNDRY_PROJECT_ENDPOINT="<your-project-endpoint>"
 FOUNDRY_MODEL="<your-model-deployment>"
 
+# Only needed for evaluate_with_rubric_sample.py — connects to the
+# pre-existing Foundry agent that the rubric evaluator was created against.
+FOUNDRY_AGENT_NAME="<your-agent-name>"
+FOUNDRY_AGENT_VERSION="<your-agent-version>"
+
+# Only needed for evaluate_with_rubric_sample.py — references a rubric
+# evaluator you created in Foundry. Pin the version for reproducible runs.
+FOUNDRY_RUBRIC_NAME="<your-rubric-name>"
+FOUNDRY_RUBRIC_VERSION="<your-rubric-version>"
\ No newline at end of file
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
index 4ef22f6ee66..2f47c468612 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
@@ -61,6 +61,10 @@ Quality gates on rubric output use the standard `EvalResults` helpers,
 including `assert_dimension_score_at_least(...)` for per-dimension
 thresholds.
 
+See [`evaluate_with_rubric_sample.py`](./evaluate_with_rubric_sample.py)
+for a runnable end-to-end example that combines a rubric evaluator with
+built-in evaluators and gates a per-dimension threshold.
+
 ## Setup
 
 Create a `.env` file with configuration as in the `.env.example` file in this folder.
@@ -70,4 +74,4 @@ Create a `.env` file with configuration as in the `.env.example` file in this fo
 - **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1
 - **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py`
 - **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2
-- **"I want to score against a custom rubric I created in Foundry"** → pass a `GeneratedEvaluatorRef` (see snippet above)
+- **"I want to score against a custom rubric I created in Foundry"** → `evaluate_with_rubric_sample.py`
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py
new file mode 100644
index 00000000000..06ec5c9bdd7
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_rubric_sample.py
@@ -0,0 +1,138 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Evaluate a Foundry agent against a rubric evaluator that was created in Foundry.
+
+Rubric evaluators are LLM-as-judge evaluators with custom scoring dimensions
+that you define for your domain. agent-framework consumes pre-existing rubric
+evaluators — they are authored in the Foundry portal (or via the dedicated
+SDK / REST surface) and referenced here by name and version.
+
+See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators
+
+This sample demonstrates:
+1. Connecting to a pre-existing Foundry agent (PromptAgent or HostedAgent).
+2. Referencing a pre-existing rubric evaluator by ``name`` and ``version``.
+3. Mixing the rubric with built-in Foundry evaluators in one run.
+4. Asserting per-dimension thresholds with
+   ``EvalResults.assert_dimension_score_at_least(...)`` for CI quality gates.
+
+Starting condition / prerequisites:
+- An Azure AI Foundry project with a deployed model.
+- A registered Foundry agent (PromptAgent or HostedAgent) in that project.
+  This is the agent the rubric is meant to evaluate.
+- A rubric evaluator already created in the Foundry portal against that
+  agent. Creating rubrics through the portal currently requires picking a
+  Foundry agent as the generation context, so this prerequisite is implied
+  by having a rubric at all.
+- Set the following in .env (see ``.env.example``):
+    - ``FOUNDRY_PROJECT_ENDPOINT``
+    - ``FOUNDRY_AGENT_NAME`` (required) and ``FOUNDRY_AGENT_VERSION`` (optional) for the agent
+    - ``FOUNDRY_RUBRIC_NAME`` and ``FOUNDRY_RUBRIC_VERSION`` for the rubric
+    - ``FOUNDRY_MODEL`` for the rubric judge model
+"""
+
+import asyncio
+import os
+
+from agent_framework import EvalNotPassedError, evaluate_agent
+from agent_framework.foundry import FoundryAgent, FoundryChatClient, FoundryEvals, GeneratedEvaluatorRef
+from azure.identity import AzureCliCredential
+from dotenv import load_dotenv
+
+load_dotenv(override=True)
+
+
+async def main() -> None:
+    # 1. Connect to the existing Foundry agent that the rubric was created
+    #    against. PromptAgents and HostedAgents are both supported.
+    credential = AzureCliCredential()
+    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
+
+    agent = FoundryAgent(
+        project_endpoint=project_endpoint,
+        agent_name=os.environ["FOUNDRY_AGENT_NAME"],
+        agent_version=os.environ.get("FOUNDRY_AGENT_VERSION"),
+        credential=credential,
+    )
+
+    # 2. Reference the pre-existing rubric evaluator by name + version.
+    #    Always pin a version for reproducible CI runs; versionless refs
+    #    resolve to "latest" and emit a warning at evaluation time.
+    rubric_name = os.environ["FOUNDRY_RUBRIC_NAME"]
+    rubric_version = os.environ["FOUNDRY_RUBRIC_VERSION"]
+    rubric = GeneratedEvaluatorRef(name=rubric_name, version=rubric_version)
+
+    # 3. Mix the rubric with built-in evaluators in a single FoundryEvals
+    #    config. FoundryEvals talks to Foundry over the project endpoint, so
+    #    we hand it a FoundryChatClient configured with the same credential.
+    eval_client = FoundryChatClient(
+        project_endpoint=project_endpoint,
+        model=os.environ["FOUNDRY_MODEL"],
+        credential=credential,
+    )
+    evals = FoundryEvals(
+        client=eval_client,
+        evaluators=[
+            rubric,
+            FoundryEvals.RELEVANCE,
+            FoundryEvals.COHERENCE,
+        ],
+    )
+
+    # =========================================================================
+    # Run evaluation
+    # =========================================================================
+    print("=" * 60)
+    print(f"Evaluating '{agent.name}' with rubric '{rubric_name}' (version {rubric_version})")
+    print("=" * 60)
+
+    results = await evaluate_agent(
+        agent=agent,
+        queries=[
+            "What's the weather like in Seattle?",
+            "Should I bring an umbrella to London tomorrow?",
+        ],
+        evaluators=evals,
+    )
+
+    for r in results:
+        print(f"Status: {r.status}")
+        print(f"Results: {r.passed}/{r.total} passed")
+        print(f"Portal: {r.report_url}")
+        if r.all_passed:
+            print("[PASS] All passed")
+        else:
+            print(f"[FAIL] {r.failed} failed")
+
+    # =========================================================================
+    # Per-dimension quality gate
+    # =========================================================================
+    # Rubric evaluators emit per-dimension scores (1–5) on top of the overall
+    # weighted score. assert_dimension_score_at_least raises EvalNotPassedError
+    # when a dimension drops below the minimum (3 here). This sample catches
+    # it to print a summary; in a CI gate, let it propagate to fail the job.
+    #
+    # The dimension_id must match an id defined on your rubric in Foundry.
+    # ``general_quality`` is used here because it's the conventional
+    # ``always_applicable: true`` dimension in the Foundry docs' example
+    # rubric — swap it for whatever dimension id(s) your rubric actually
+    # defines.
+    print()
+    print("=" * 60)
+    print("Per-dimension quality gate")
+    print("=" * 60)
+
+    for r in results:
+        try:
+            r.assert_dimension_score_at_least(
+                "general_quality",
+                min_score=3.0,
+                evaluator=rubric_name,
+            )
+            print(f"[PASS] {r.provider}: general_quality >= 3 on every item")
+        except EvalNotPassedError as exc:
+            print(f"[FAIL] {r.provider}: dimension gate tripped: {exc}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From b6a558d6e2aa8e0bfd4a722cc991552574bb45a4 Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Fri, 29 May 2026 08:34:46 -0700
Subject: [PATCH 14/16] fix(foundry-evals): satisfy mypy on _fetch_output_items

mypy infers OutputItemListResponse.sample as dict[str, object] | None while pyright correctly infers the typed Sample model. Cast to Any so both type checkers accept the attribute access pattern, rename the local to avoid shadowing the inner-loop sample binding, and drop the now-stale pyright suppressions.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../agent_framework_foundry/_foundry_evals.py | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index f242db06d91..8059c2ce990 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -598,15 +598,18 @@ async def _fetch_output_items(
             output_text: str | None = None
             response_id: str | None = None
 
-            sample = oi.sample
-            if sample is not None:  # pyright: ignore[reportUnnecessaryComparison]
-                err = sample.error
-                if err is not None and (err.code or err.message):  # pyright: ignore[reportUnnecessaryComparison]
+            # mypy infers oi.sample as dict[str, object] | None, but the
+            # OpenAI SDK actually returns a typed Sample model. Annotate the
+            # local as Any so both type checkers accept the attribute access.
+            oi_sample: Any = oi.sample
+            if oi_sample is not None:
+                err = oi_sample.error
+                if err is not None and (err.code or err.message):
                     error_code = err.code or None
                     error_message = err.message or None
 
-                usage = sample.usage
-                if usage is not None and usage.total_tokens:  # pyright: ignore[reportUnnecessaryComparison]
+                usage = oi_sample.usage
+                if usage is not None and usage.total_tokens:
                     token_usage = {
                         "prompt_tokens": usage.prompt_tokens,
                         "completion_tokens": usage.completion_tokens,
@@ -615,13 +618,13 @@ async def _fetch_output_items(
                     }
 
                 # Extract input/output text
-                if sample.input:
-                    parts = [si.content for si in sample.input if si.role == "user"]
+                if oi_sample.input:
+                    parts = [si.content for si in oi_sample.input if si.role == "user"]
                     if parts:
                         input_text = " ".join(parts)
 
-                if sample.output:
-                    parts = [so.content or "" for so in sample.output if so.role == "assistant"]
+                if oi_sample.output:
+                    parts = [so.content or "" for so in oi_sample.output if so.role == "assistant"]
                     if parts:
                         output_text = " ".join(parts)
 

From 93cf732b48a6c6a24013564014f2ecaae66d22af Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Fri, 29 May 2026 08:39:36 -0700
Subject: [PATCH 15/16] docs(foundry-evals): drop unpublished rubric-evaluators
 learn.microsoft.com link

The Adaptive Evals authoring docs are not yet published on Microsoft Learn, so the link 404s. Keep the descriptive text without the broken hyperlink; we can re-add it once the docs ship.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../05-end-to-end/evaluation/foundry_evals/README.md        | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
index 2f47c468612..e30ce6aa464 100644
--- a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md
@@ -38,10 +38,8 @@ uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py
 ### Referencing a rubric evaluator created in Foundry
 
 Foundry users can create rubric evaluators in the Foundry portal (or
-through the dedicated SDK / REST surface) — see
-[Rubric evaluators](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/rubric-evaluators)
-for the authoring flow. Once an evaluator exists, agent-framework
-consumes it like any other evaluator: pass a
+through the dedicated SDK / REST surface). Once an evaluator exists,
+agent-framework consumes it like any other evaluator: pass a
 `GeneratedEvaluatorRef(name=..., version=...)` in the `evaluators=`
 list and pin the version for reproducible runs.
 

From fb3eb7fe7d9bc34d255fbb4cc66b9cfa763b2f28 Mon Sep 17 00:00:00 2001
From: alliscode <25218250+alliscode@users.noreply.github.com>
Date: Mon, 1 Jun 2026 11:43:51 -0700
Subject: [PATCH 16/16] test(foundry-evals): hoist repeated local imports to
 module top

Per code review feedback (eavanvalkenburg): the test file repeated 'from agent_framework_foundry._foundry_evals import ...' inside 22 test bodies and 'from agent_framework_foundry import GeneratedEvaluatorRef' inside 8 more. Move all of them to the existing top-level imports; the symbols are the same across tests and the local imports were redundant.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../foundry/tests/test_foundry_evals.py       | 39 +++++--------------
 1 file changed, 9 insertions(+), 30 deletions(-)

diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index df8627352bb..8734650aafb 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -25,16 +25,25 @@
 from agent_framework._workflows._workflow import WorkflowRunResult
 from openai import AsyncOpenAI
 
+from agent_framework_foundry import GeneratedEvaluatorRef
 from agent_framework_foundry._foundry_evals import (
+    _AGENT_EVALUATORS,
+    _BUILTIN_EVALUATORS,
+    _TOOL_EVALUATORS,
     FoundryEvals,
     _build_item_schema,
     _build_testing_criteria,
     _extract_per_evaluator,
     _extract_result_counts,
+    _extract_rubric_scores,
+    _fetch_output_items,
     _filter_tool_evaluators,
+    _poll_eval_run,
     _resolve_default_evaluators,
     _resolve_evaluator,
     _resolve_openai_client,
+    evaluate_foundry_target,
+    evaluate_traces,
 )
 
 
@@ -807,7 +816,6 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None:
             assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
 
     def test_generated_evaluator_ref_pinned_version(self) -> None:
-        from agent_framework_foundry import GeneratedEvaluatorRef
 
         ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
         criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True)
@@ -825,7 +833,6 @@ def test_generated_evaluator_ref_pinned_version(self) -> None:
         }
 
     def test_generated_evaluator_ref_display_name_used_as_short(self) -> None:
-        from agent_framework_foundry import GeneratedEvaluatorRef
 
         ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric")
         criteria = _build_testing_criteria([ref], "gpt-4o")
@@ -834,7 +841,6 @@ def test_generated_evaluator_ref_display_name_used_as_short(self) -> None:
         assert criteria[0]["evaluator_name"] == "my-rubric"
 
     def test_generated_evaluator_ref_tool_definitions_added(self) -> None:
-        from agent_framework_foundry import GeneratedEvaluatorRef
 
         ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
         criteria = _build_testing_criteria(
@@ -849,8 +855,6 @@ def test_generated_evaluator_ref_tool_definitions_added(self) -> None:
     def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None:
         import logging
 
-        from agent_framework_foundry import GeneratedEvaluatorRef
-
         ref = GeneratedEvaluatorRef.latest("my-rubric")
         with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"):
             criteria = _build_testing_criteria([ref], "gpt-4o")
@@ -859,7 +863,6 @@ def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureF
         assert any("no pinned version" in r.message for r in caplog.records)
 
     def test_generated_evaluator_ref_mixed_with_builtins(self) -> None:
-        from agent_framework_foundry import GeneratedEvaluatorRef
 
         ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
         criteria = _build_testing_criteria(
@@ -1331,7 +1334,6 @@ def test_raises_when_all_filtered(self) -> None:
             )
 
     def test_preserves_generated_ref_when_no_tools(self) -> None:
-        from agent_framework_foundry import GeneratedEvaluatorRef
 
         ref = GeneratedEvaluatorRef(name="rubric", version="1")
         items = [
@@ -1346,7 +1348,6 @@ def test_preserves_generated_ref_when_no_tools(self) -> None:
         assert "tool_call_accuracy" not in result
 
     def test_generated_ref_alone_does_not_raise(self) -> None:
-        from agent_framework_foundry import GeneratedEvaluatorRef
 
         ref = GeneratedEvaluatorRef(name="rubric", version="1")
         items = [
@@ -2359,7 +2360,6 @@ def test_raise_for_status_includes_errored_items(self) -> None:
 
 class TestFetchOutputItems:
     async def test_fetches_and_converts_output_items(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         # Build mock output items matching the OpenAI SDK schema
         mock_result = MagicMock()
@@ -2421,7 +2421,6 @@ async def test_fetches_and_converts_output_items(self) -> None:
         assert item.error_code is None
 
     async def test_handles_errored_item(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         mock_error = MagicMock()
         mock_error.code = "QueryExtractionError"
@@ -2453,7 +2452,6 @@ async def test_handles_errored_item(self) -> None:
         assert len(item.scores) == 0
 
     async def test_handles_api_failure_gracefully(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         mock_client = MagicMock()
         mock_client.evals.runs.output_items.list = AsyncMock(side_effect=TypeError("API error"))
@@ -2462,7 +2460,6 @@ async def test_handles_api_failure_gracefully(self) -> None:
         assert items == []
 
     async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         mock_result = MagicMock()
         mock_result.name = "my-rubric"
@@ -2504,7 +2501,6 @@ async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
         assert safety.applicable is False
 
     async def test_no_rubric_scores_when_absent(self) -> None:
-        from agent_framework_foundry._foundry_evals import _fetch_output_items
 
         mock_result = MagicMock()
         mock_result.name = "relevance"
@@ -2529,7 +2525,6 @@ async def test_no_rubric_scores_when_absent(self) -> None:
 
 class TestExtractRubricScores:
     def test_handles_attribute_style_properties(self) -> None:
-        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
 
         rs = MagicMock()
         rs.id = "policy"
@@ -2549,7 +2544,6 @@ def test_handles_attribute_style_properties(self) -> None:
         assert result[0].weight == 2
 
     def test_top_level_rubric_scores_in_dict(self) -> None:
-        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
 
         sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]}
         result = _extract_rubric_scores(sample)
@@ -2557,14 +2551,12 @@ def test_top_level_rubric_scores_in_dict(self) -> None:
         assert result[0].id == "a"
 
     def test_returns_none_when_missing(self) -> None:
-        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
 
         assert _extract_rubric_scores(None) is None
         assert _extract_rubric_scores({}) is None
         assert _extract_rubric_scores({"properties": {}}) is None
 
     def test_skips_malformed_entries(self) -> None:
-        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
 
         sample = {
             "properties": {
@@ -2581,7 +2573,6 @@ def test_skips_malformed_entries(self) -> None:
 
     def test_canonical_dimension_scores_key_from_docs(self) -> None:
         """Per the Microsoft Learn docs, runtime output uses ``properties.dimension_scores``."""
-        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
 
         sample = {
             "properties": {
@@ -2611,7 +2602,6 @@ def test_canonical_dimension_scores_key_from_docs(self) -> None:
 
     def test_dimension_scores_via_attribute(self) -> None:
         """Canonical key also resolves when properties exposes ``dimension_scores`` as an attr."""
-        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
 
         rs = MagicMock()
         rs.id = "policy_enforcement"
@@ -2638,7 +2628,6 @@ def test_dimension_scores_via_attribute(self) -> None:
 class TestPollEvalRun:
     async def test_timeout_returns_timeout_status(self) -> None:
         """Poll timeout returns EvalResults with status='timeout'."""
-        from agent_framework_foundry._foundry_evals import _poll_eval_run
 
         mock_client = MagicMock()
         mock_pending = MagicMock()
@@ -2652,7 +2641,6 @@ async def test_timeout_returns_timeout_status(self) -> None:
 
     async def test_failed_run_returns_error(self) -> None:
         """Failed run returns EvalResults with error message."""
-        from agent_framework_foundry._foundry_evals import _poll_eval_run
 
         mock_client = MagicMock()
         mock_failed = MagicMock()
@@ -2670,7 +2658,6 @@ async def test_failed_run_returns_error(self) -> None:
 
     async def test_canceled_run_returns_canceled_status(self) -> None:
         """Canceled run returns EvalResults with status='canceled'."""
-        from agent_framework_foundry._foundry_evals import _poll_eval_run
 
         mock_client = MagicMock()
         mock_canceled = MagicMock()
@@ -2695,7 +2682,6 @@ async def test_canceled_run_returns_canceled_status(self) -> None:
 class TestEvaluateTraces:
     async def test_raises_without_required_args(self) -> None:
         """Raises ValueError when no response_ids, trace_ids, or agent_id given."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
         with pytest.raises(ValueError, match="Provide at least one of"):
@@ -2706,7 +2692,6 @@ async def test_raises_without_required_args(self) -> None:
 
     async def test_response_ids_path(self) -> None:
         """evaluate_traces with response_ids uses the responses API path."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
 
@@ -2754,7 +2739,6 @@ async def test_response_ids_path(self) -> None:
 
     async def test_trace_ids_path(self) -> None:
         """evaluate_traces with trace_ids builds azure_ai_traces data source."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
 
@@ -2794,7 +2778,6 @@ async def test_trace_ids_path(self) -> None:
 class TestEvaluateFoundryTarget:
     async def test_happy_path(self) -> None:
         """evaluate_foundry_target creates eval + run and polls to completion."""
-        from agent_framework_foundry._foundry_evals import evaluate_foundry_target
 
         mock_client = MagicMock()
 
@@ -2930,13 +2913,11 @@ class TestEvaluatorSetConsistency:
     """Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS."""
 
     def test_agent_evaluators_subset(self):
-        from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS
 
         diff = _AGENT_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
         assert not diff, f"_AGENT_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"
 
     def test_tool_evaluators_subset(self):
-        from agent_framework_foundry._foundry_evals import _BUILTIN_EVALUATORS, _TOOL_EVALUATORS
 
         diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values())
         assert not diff, f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}"
@@ -2950,7 +2931,6 @@ def test_tool_evaluators_subset(self):
 class TestEvaluateTracesAgentId:
     async def test_agent_id_only_path(self) -> None:
         """evaluate_traces with agent_id only builds azure_ai_traces data source."""
-        from agent_framework_foundry._foundry_evals import evaluate_traces
 
         mock_client = MagicMock()
 
@@ -3008,7 +2988,6 @@ def test_all_tool_evaluators_no_tools_raises(self):
 class TestEvaluateFoundryTargetValidation:
     async def test_target_without_type_raises(self) -> None:
         """target dict without 'type' key raises ValueError."""
-        from agent_framework_foundry._foundry_evals import evaluate_foundry_target
 
         mock_client = MagicMock()
         with pytest.raises(ValueError, match="'type' key"):