diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py
index 356051da3f..52368df476 100644
--- a/python/packages/core/agent_framework/__init__.py
+++ b/python/packages/core/agent_framework/__init__.py
@@ -70,6 +70,7 @@
     Evaluator,
     ExpectedToolCall,
     LocalEvaluator,
+    RubricScore,
     evaluate_agent,
     evaluate_workflow,
     evaluator,
@@ -425,6 +426,7 @@
     "ResponseStream",
     "Role",
     "RoleLiteral",
+    "RubricScore",
     "RunContext",
     "Runner",
     "RunnerContext",
diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py
index 585898ae52..65506cadc6 100644
--- a/python/packages/core/agent_framework/_agents.py
+++ b/python/packages/core/agent_framework/_agents.py
@@ -444,6 +444,49 @@ def get_session(self, service_session_id: str, *, session_id: str | None = None)
         """
         return AgentSession(session_id=session_id, service_session_id=service_session_id)
 
+    def as_eval_source(
+        self,
+        *,
+        include_instructions: bool = True,
+        include_tools: bool = True,
+        include_context_providers: bool = False,
+        include_examples: bool = False,
+        examples: Sequence[str] | None = None,
+    ) -> str:
+        """Build the plain-text dossier that describes this agent to a rubric generator.
+
+        The rendering is delegated to ``_render_agent_dossier``.  The
+        agent name (plus the description, when set) is always present;
+        each flag below switches one further section on or off.  The
+        result can be passed to e.g. ``FoundryEvals.generate_rubric``.
+
+        Only instructions and tools are rendered by default; examples
+        and context-provider class names have to be opted into.
+
+        Keyword Args:
+            include_instructions: Render the agent's instructions text.
+            include_tools: Render the agent's tool definitions.
+            include_context_providers: Render attached context-provider class names.
+            include_examples: Render the supplied ``examples``; has no
+                effect when ``examples`` is empty or ``None``.
+            examples: Sample queries / interactions that are rendered
+                when ``include_examples`` is true.
+
+        Returns:
+            The dossier as a single plain-text string.
+        """
+        from ._evaluation import _render_agent_dossier  # pyright: ignore[reportPrivateUsage]
+
+        dossier: str = _render_agent_dossier(
+            self,
+            include_instructions=include_instructions,
+            include_tools=include_tools,
+            include_context_providers=include_context_providers,
+            include_examples=include_examples,
+            examples=examples,
+        )
+        return dossier
+
     async def _run_after_providers(
         self,
         *,
diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py
index 64fab0eacb..b14bdee9b2 100644
--- a/python/packages/core/agent_framework/_evaluation.py
+++ b/python/packages/core/agent_framework/_evaluation.py
@@ -311,12 +311,15 @@ class EvalScoreResult:
         score: Numeric score from the evaluator.
         passed: Whether the item passed this evaluator's threshold.
         sample: Optional raw evaluator output (rationale, metadata).
+        dimensions: Per-dimension scores when this evaluator is a rubric
+            evaluator.  ``None`` for non-rubric (e.g. built-in) evaluators.
     """
 
     name: str
     score: float
     passed: bool | None = None
     sample: dict[str, Any] | None = None
+    dimensions: list[RubricScore] | None = None
 
 
 @experimental(feature_id=ExperimentalFeature.EVALS)
@@ -496,6 +499,313 @@ def raise_for_status(self, msg: str | None = None) -> None:
                     detail += f" Errored items: {', '.join(summaries)}."
             raise EvalNotPassedError(detail)
 
+    def assert_score_at_least(
+        self,
+        min_score: float,
+        *,
+        evaluator: str | None = None,
+        msg: str | None = None,
+    ) -> None:
+        """Assert every item's score (optionally filtered by evaluator) is ``>= min_score``.
+
+        Designed for CI gates on generated rubric evaluators (e.g.
+        ``results.assert_score_at_least(0.80)``).  Sub-results from workflow
+        evaluations are included, and a NaN score counts as a failure.
+
+        Args:
+            min_score: Minimum acceptable score (inclusive).
+            evaluator: When set, only check scores from the evaluator
+                whose ``EvalScoreResult.name`` matches.
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When any matching score is below the threshold or NaN.
+        """
+        offenders: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                for score in item.scores:
+                    if evaluator is not None and score.name != evaluator:
+                        continue
+                    if not (score.score >= min_score):  # negated so that NaN fails the gate too
+                        offenders.append(f"{item.item_id}/{score.name}={score.score:.3f}")
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        if offenders:
+            detail = msg or (
+                f"{len(offenders)} score(s) below threshold {min_score}"
+                f"{' for ' + evaluator if evaluator else ''}: {', '.join(offenders[:5])}"
+                + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "")
+            )
+            raise EvalNotPassedError(detail)
+
+    def assert_dimension_score_at_least(
+        self,
+        dimension_id: str,
+        min_score: float,
+        *,
+        evaluator: str | None = None,
+        require_applicable: bool = False,
+        msg: str | None = None,
+    ) -> None:
+        """Assert every item's score for a rubric *dimension* is ``>= min_score``.
+
+        Walks ``EvalScoreResult.dimensions`` looking for the named dimension
+        across all items (and sub-results).  Non-applicable dimensions are
+        skipped by default; pass ``require_applicable=True`` to fail when
+        an item produces no applicable score.
+
+        Args:
+            dimension_id: Dimension id (matches the rubric definition).
+            min_score: Minimum acceptable dimension score (inclusive).
+            evaluator: When set, only consider scores from the evaluator
+                whose ``EvalScoreResult.name`` matches.
+            require_applicable: When ``True``, missing or non-applicable
+                dimension scores raise.  Defaults to ``False`` (skip).
+            msg: Optional custom failure message.
+
+        Raises:
+            EvalNotPassedError: When the dimension fails the threshold or a required score is missing.
+        """
+        offenders: list[str] = []
+        missing_items: list[str] = []
+
+        def _check(results: EvalResults) -> None:
+            for item in results.items:
+                found_applicable = False
+                for score in item.scores:
+                    if evaluator is not None and score.name != evaluator:
+                        continue
+                    if not score.dimensions:
+                        continue
+                    for rs in score.dimensions:
+                        if rs.id != dimension_id:
+                            continue
+                        if not rs.applicable:
+                            continue
+                        found_applicable = True
+                        if rs.score is None or rs.score < min_score:
+                            offenders.append(
+                                f"{item.item_id}/{score.name}/{dimension_id}="
+                                f"{rs.score if rs.score is not None else 'None'}"
+                            )
+                if require_applicable and not found_applicable:
+                    missing_items.append(item.item_id)
+            for sub in results.sub_results.values():
+                _check(sub)
+
+        _check(self)
+        problems: list[str] = []
+        if offenders:
+            problems.append(
+                f"{len(offenders)} dimension score(s) for '{dimension_id}' below {min_score}: "
+                f"{', '.join(offenders[:5])}" + (f" (+{len(offenders) - 5} more)" if len(offenders) > 5 else "")
+            )
+        if missing_items:
+            extra = f" (+{len(missing_items) - 5} more)" if len(missing_items) > 5 else ""  # same cap as offenders
+            problems.append(
+                f"Dimension '{dimension_id}' not applicable on {len(missing_items)} item(s): "
+                f"{', '.join(missing_items[:5])}{extra}"
+            )
+        if problems:
+            raise EvalNotPassedError(msg or "; ".join(problems))
+
+    def assert_no_failed_items(self, msg: str | None = None) -> None:
+        """Raise unless every item finished without a ``fail`` or ``error`` status.
+
+        Sub-results from workflow evaluations are checked as well.
+
+        Args:
+            msg: Custom failure message that replaces the generated summary.
+
+        Raises:
+            EvalNotPassedError: If at least one item failed or errored.
+        """
+        failures: list[str] = []
+        pending: list[EvalResults] = [self]
+        while pending:
+            current = pending.pop()
+            failures.extend(
+                f"{entry.item_id}:{entry.status}" for entry in current.items if entry.is_failed or entry.is_error
+            )
+            # Pushed in reverse so that sub-results are visited in their original order.
+            pending.extend(reversed(list(current.sub_results.values())))
+
+        if not failures:
+            return
+
+        overflow = f" (+{len(failures) - 5} more)" if len(failures) > 5 else ""
+        summary = f"{len(failures)} item(s) failed or errored: {', '.join(failures[:5])}{overflow}"
+        raise EvalNotPassedError(msg or summary)
+
+
+# endregion
+
+# region Generated rubric evaluators
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricScore:
+    """A single dimension's score from a rubric-based evaluator run.
+
+    Rubric evaluators emit one ``RubricScore`` per dimension per item.
+    Instances are frozen and are attached to :class:`EvalScoreResult`
+    (``dimensions``) as a typed view of the raw ``properties.rubric_scores``
+    payload returned by providers such as Foundry's generated rubric evaluators.
+
+    Attributes:
+        id: Dimension id (matches the rubric definition).
+        score: Numeric score, or ``None`` when the dimension was marked
+            non-applicable for this item.
+        applicable: Whether the dimension applied to this item.
+        weight: Dimension weight (mirrors the rubric definition).
+        reason: Short rationale produced by the evaluator.
+    """
+
+    id: str
+    score: int | None
+    applicable: bool
+    weight: int
+    reason: str
+
+
+# endregion
+
+# region Eval source rendering
+
+
+def _render_agent_dossier(
+    agent: Any,
+    *,
+    include_instructions: bool,
+    include_tools: bool,
+    include_context_providers: bool,
+    include_examples: bool,
+    examples: Sequence[str] | None,
+) -> str:
+    """Compose the plain-text agent dossier consumed by rubric generation.
+
+    The agent name line is always emitted (with a placeholder when the
+    name is missing) and the description follows when set.  Every other
+    section appears only if its flag is set and there is content for it.
+
+    Sections, in order: instructions, tools, context providers and
+    examples.  Each section is preceded by a blank line.
+
+    Returns:
+        The rendered dossier with surrounding whitespace stripped.
+    """
+    def _usable(value: Any) -> bool:
+        # Instructions only count when they are a non-blank string.
+        return isinstance(value, str) and bool(value.strip())
+
+    agent_name = getattr(agent, "name", None) or "<unnamed agent>"
+    summary = getattr(agent, "description", None)
+    out: list[str] = [f"Agent name: {agent_name}"]
+    if summary:
+        out.append(f"Description: {summary}")
+
+    if include_instructions:
+        # ``default_options["instructions"]`` takes precedence over ``agent.instructions``.
+        options: Any = getattr(agent, "default_options", None)
+        candidate: Any = cast("dict[str, Any]", options).get("instructions") if isinstance(options, dict) else None
+        if not _usable(candidate):
+            candidate = getattr(agent, "instructions", None)
+        if _usable(candidate):
+            out += ["", "Instructions:", candidate.strip()]
+
+    # Tool definitions are whatever ``AgentEvalConverter.extract_tools`` reports.
+    tool_defs = AgentEvalConverter.extract_tools(agent) if include_tools else None
+    if tool_defs:
+        out += ["", "Tools:"]
+        for spec in tool_defs:
+            entry = f"- {spec['name']}"
+            blurb = spec.get("description")
+            out.append(f"{entry}: {blurb}" if blurb else entry)
+            schema = spec.get("parameters")
+            if not schema:
+                continue
+            try:
+                encoded = json.dumps(schema, sort_keys=True)
+            except (TypeError, ValueError):
+                # Not JSON-serialisable: fall back to the plain ``str`` form.
+                encoded = str(schema)
+            out.append(f"  parameters: {encoded}")
+
+    attached = getattr(agent, "context_providers", None) if include_context_providers else None
+    if attached:
+        out += ["", "Context providers:"]
+        out.extend(f"- {type(provider).__name__}" for provider in attached)
+
+    if include_examples and examples:
+        out += ["", "Examples:"]
+        out.extend(f"{position}. {text}" for position, text in enumerate(examples, start=1))
+
+    return "\n".join(out).strip()
+
+
+def _render_workflow_dossier(  # pyright: ignore[reportUnusedFunction]
+    workflow: Workflow,
+    *,
+    include_instructions: bool,
+    include_tools: bool,
+    include_context_providers: bool,
+    include_examples: bool,
+    examples: Sequence[str] | None,
+    include_topology: bool,
+) -> str:
+    """Compose the plain-text workflow dossier consumed by rubric generation.
+
+    Emits the workflow name (or a placeholder), the description when
+    set, the JSON topology from ``workflow.to_dict()`` when
+    ``include_topology`` is true, one agent dossier per ``AgentExecutor``
+    node, and finally the workflow-level examples.
+
+    Per-agent dossiers never carry examples; the three per-agent flags
+    are forwarded unchanged to ``_render_agent_dossier``.
+
+    Returns:
+        The rendered dossier with surrounding whitespace stripped.
+    """
+    from ._workflows._agent_executor import AgentExecutor as _AE
+
+    out: list[str] = [f"Workflow name: {workflow.name or '<unnamed workflow>'}"]
+    if workflow.description:
+        out.append(f"Description: {workflow.description}")
+
+    if include_topology:
+        encoded: str | None = None
+        try:
+            encoded = json.dumps(workflow.to_dict(), sort_keys=True, default=str)
+        except (TypeError, ValueError) as exc:
+            logger.debug("Workflow.to_dict() failed during eval source export: %s", exc)
+        if encoded:
+            out += ["", "Topology (JSON):", encoded]
+
+    hosted_agents: list[tuple[str, Any]] = [
+        (node_id, node.agent) for node_id, node in workflow.executors.items() if isinstance(node, _AE)
+    ]
+    if hosted_agents:
+        out += ["", "Agents:"]
+        for node_id, hosted in hosted_agents:
+            section = _render_agent_dossier(
+                hosted,
+                include_instructions=include_instructions,
+                include_tools=include_tools,
+                include_context_providers=include_context_providers,
+                include_examples=False,
+                examples=None,
+            )
+            out += ["", f"Executor: {node_id}", section]
+
+    if include_examples and examples:
+        out += ["", "Examples:"]
+        out.extend(f"{position}. {text}" for position, text in enumerate(examples, start=1))
+
+    return "\n".join(out).strip()
+
 
 # endregion
 
diff --git a/python/packages/core/agent_framework/_workflows/_workflow.py b/python/packages/core/agent_framework/_workflows/_workflow.py
index 0493cd015f..bce7569ef1 100644
--- a/python/packages/core/agent_framework/_workflows/_workflow.py
+++ b/python/packages/core/agent_framework/_workflows/_workflow.py
@@ -410,6 +410,55 @@ def to_json(self) -> str:
         """Serialize the workflow definition to JSON."""
         return json.dumps(self.to_dict())
 
+    def as_eval_source(
+        self,
+        *,
+        include_instructions: bool = True,
+        include_tools: bool = True,
+        include_context_providers: bool = False,
+        include_examples: bool = False,
+        examples: Sequence[str] | None = None,
+        include_topology: bool = True,
+    ) -> str:
+        """Build the plain-text dossier that describes this workflow to a rubric generator.
+
+        The rendering is delegated to ``_render_workflow_dossier``, which
+        emits the workflow's name and description, the JSON-encoded
+        topology (from :meth:`Workflow.to_dict`) and one agent dossier
+        per ``AgentExecutor`` node.  The result can be passed to e.g.
+        ``FoundryEvals.generate_rubric``.
+
+        By default the topology and the per-agent instructions and
+        tools are rendered; examples and context-provider class names
+        have to be opted into.
+
+        Keyword Args:
+            include_instructions: Render each agent's instructions.
+            include_tools: Render each agent's tool definitions.
+            include_context_providers: Render each agent's
+                context-provider class names.
+            include_examples: Render the workflow-level ``examples``;
+                has no effect when ``examples`` is empty or ``None``.
+            examples: Sample queries / interactions that are rendered
+                when ``include_examples`` is true.
+            include_topology: Embed the JSON-encoded workflow topology in the dossier.
+
+        Returns:
+            The dossier as a single plain-text string.
+        """
+        from .._evaluation import _render_workflow_dossier  # pyright: ignore[reportPrivateUsage]
+
+        dossier: str = _render_workflow_dossier(
+            self,
+            include_instructions=include_instructions,
+            include_tools=include_tools,
+            include_context_providers=include_context_providers,
+            include_examples=include_examples,
+            examples=examples,
+            include_topology=include_topology,
+        )
+        return dossier
+
     def get_start_executor(self) -> Executor:
         """Get the starting executor of the workflow.
 
diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py
index 96b0e1a391..e4c37dfb4b 100644
--- a/python/packages/core/tests/core/test_local_eval.py
+++ b/python/packages/core/tests/core/test_local_eval.py
@@ -5,14 +5,20 @@
 from __future__ import annotations
 
 import inspect
+from typing import Any
 
 import pytest
 
 from agent_framework._evaluation import (
     CheckResult,
     EvalItem,
+    EvalItemResult,
+    EvalNotPassedError,
+    EvalResults,
+    EvalScoreResult,
     ExpectedToolCall,
     LocalEvaluator,
+    RubricScore,
     _coerce_result,
     evaluator,
     keyword_check,
@@ -1010,19 +1016,300 @@ def test_all_passed_parent_fails_when_own_counts_fail(self):
 
 
 # ---------------------------------------------------------------------------
-# r5 review: _build_overall_item with empty outputs
+# Rubric assertions (EvalResults.assert_*)
 # ---------------------------------------------------------------------------
 
 
-class TestBuildOverallItemEmpty:
-    """Test _build_overall_item returns None for empty workflow outputs."""
+def _rubric_results(*scores_per_item: list[EvalScoreResult]) -> EvalResults:
+    """Wrap each score list in a passing ``EvalItemResult`` named ``item-<index>``."""
+    passing = [
+        EvalItemResult(item_id=f"item-{idx}", status="pass", scores=item_scores)
+        for idx, item_scores in enumerate(scores_per_item)
+    ]
+    total = len(passing)
+    counts = {"passed": total, "failed": 0, "errored": 0, "total": total}
+    return EvalResults(
+        provider="test", eval_id="ev1", run_id="run1", result_counts=counts, items=passing
+    )
+
+
+class TestRubricAssertions:
+    """Tests for EvalResults.assert_dimension_score_at_least (threshold, applicability, evaluator filter)."""
+
+    def test_dimension_at_or_above_threshold_passes(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        # Dimension score 4 meets the threshold of 3, so this should not raise.
+        results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_dimension_below_threshold_raises(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=0.5,
+                    dimensions=[RubricScore(id="clarity", score=2, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        with pytest.raises(EvalNotPassedError):
+            results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_non_applicable_skipped_by_default(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="policy",
+                    score=1.0,
+                    dimensions=[RubricScore(id="clarity", score=None, applicable=False, weight=1, reason="n/a")],
+                )
+            ],
+        )
+        # No applicable scores; with require_applicable left at False this is skipped silently.
+        results.assert_dimension_score_at_least("clarity", 3)
+
+    def test_require_applicable_raises_when_dimension_absent(self) -> None:
+        results = _rubric_results(
+            [EvalScoreResult(name="policy", score=1.0, dimensions=[])],
+        )
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("clarity", 3, require_applicable=True)
+
+    def test_require_applicable_raises_when_filtered_evaluator_missing(self) -> None:
+        # Regression: previously the (not evaluator or found_any) guard caused
+        # this case to silently pass even with require_applicable=True.
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="other",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                )
+            ],
+        )
+        with pytest.raises(EvalNotPassedError, match="not applicable"):
+            results.assert_dimension_score_at_least("clarity", 3, evaluator="policy", require_applicable=True)
+
+    def test_evaluator_filter_isolates_offenders(self) -> None:
+        results = _rubric_results(
+            [
+                EvalScoreResult(
+                    name="other",
+                    score=0.1,
+                    dimensions=[RubricScore(id="clarity", score=1, applicable=True, weight=1, reason="")],
+                ),
+                EvalScoreResult(
+                    name="policy",
+                    score=0.9,
+                    dimensions=[RubricScore(id="clarity", score=4, applicable=True, weight=1, reason="")],
+                ),
+            ],
+        )
+        # The low-scoring "other" evaluator is excluded by the filter; only "policy" (score 4) is checked.
+        results.assert_dimension_score_at_least("clarity", 3, evaluator="policy")
+
+
+# ---------------------------------------------------------------------------
+# Eval source rendering (string dossiers)
+# ---------------------------------------------------------------------------
+
 
-    def test_returns_none_for_empty_outputs(self):
+class TestAgentAsEvalSource:
+    """Tests for BaseAgent.as_eval_source / _render_agent_dossier (section toggles and defaults)."""
+
+    def _make_mock_agent(
+        self,
+        *,
+        name: str = "weather-bot",
+        description: str | None = "Looks up the weather.",
+        instructions: str | None = "Be concise.  Always cite the source.",
+        tools: list[Any] | None = None,
+        context_providers: list[Any] | None = None,
+        mcp_tools: list[Any] | None = None,
+    ) -> Any:
         from unittest.mock import MagicMock
 
-        from agent_framework._evaluation import _build_overall_item
+        from agent_framework._tools import ai_function
+
+        agent = MagicMock()
+        agent.name = name
+        agent.description = description
+        agent.default_options = {"instructions": instructions, "tools": tools or []}
+        agent.context_providers = context_providers or []
+        agent.mcp_tools = mcp_tools or []
+        if tools:  # wrap plain callables with ai_function; keep entries that already expose ``parameters``
+            normalized: list[Any] = []
+            for t in tools:
+                if callable(t) and not hasattr(t, "parameters"):
+                    normalized.append(ai_function(t))
+                else:
+                    normalized.append(t)
+            agent.default_options["tools"] = normalized
+        return agent
+
+    def _render(self, agent: Any, **overrides: Any) -> str:
+        from agent_framework._evaluation import _render_agent_dossier
+
+        # Same defaults as BaseAgent.as_eval_source; individual tests override single flags.
+        kwargs: dict[str, Any] = {
+            "include_instructions": True,
+            "include_tools": True,
+            "include_context_providers": False,
+            "include_examples": False,
+            "examples": None,
+        }
+        kwargs.update(overrides)
+        return _render_agent_dossier(agent, **kwargs)
+
+    def test_basic_dossier_includes_name_and_instructions(self):
+        agent = self._make_mock_agent()
+        dossier = self._render(agent)
+        assert isinstance(dossier, str)
+        assert "Agent name: weather-bot" in dossier
+        assert "Description: Looks up the weather." in dossier
+        assert "Instructions:" in dossier
+        assert "Be concise." in dossier
+
+    def test_tools_section_includes_definitions(self):
+        def get_weather(city: str) -> str:
+            """Return the current weather for *city*."""
+            return f"sunny in {city}"
+
+        agent = self._make_mock_agent(tools=[get_weather])
+        dossier = self._render(agent)
+        assert "Tools:" in dossier
+        assert "- get_weather" in dossier
+        assert '"city"' in dossier
+
+    def test_include_instructions_false_omits_section(self):
+        agent = self._make_mock_agent()
+        dossier = self._render(agent, include_instructions=False)
+        assert "Instructions:" not in dossier
+
+    def test_include_tools_false_omits_section(self):
+        def get_weather(city: str) -> str:
+            return f"sunny in {city}"
+
+        agent = self._make_mock_agent(tools=[get_weather])
+        dossier = self._render(agent, include_tools=False)
+        assert "Tools:" not in dossier
+
+    def test_context_providers_excluded_by_default_but_included_when_opted_in(self):
+        class StubProvider:
+            pass
+
+        agent = self._make_mock_agent(context_providers=[StubProvider()])
+        default_dossier = self._render(agent)
+        assert "Context providers:" not in default_dossier
+
+        opt_in_dossier = self._render(agent, include_context_providers=True)
+        assert "Context providers:" in opt_in_dossier
+        assert "- StubProvider" in opt_in_dossier
+
+    def test_examples_excluded_by_default_but_included_when_opted_in(self):
+        agent = self._make_mock_agent()
+        default_dossier = self._render(agent, examples=["What's the weather in NYC?"])
+        assert "Examples:" not in default_dossier
+
+        opt_in_dossier = self._render(
+            agent,
+            include_examples=True,
+            examples=["What's the weather in NYC?"],
+        )
+        assert "Examples:" in opt_in_dossier
+        assert "What's the weather in NYC?" in opt_in_dossier
+
+    def test_base_agent_method_returns_dossier_string(self):
+        from agent_framework._agents import BaseAgent
+
+        class _ConcreteAgent(BaseAgent):
+            pass
+
+        agent = _ConcreteAgent(name="test-agent", description="A test agent.")
+        dossier = agent.as_eval_source()
+        assert isinstance(dossier, str)
+        assert "Agent name: test-agent" in dossier
+
+
+class TestWorkflowAsEvalSource:
+    """Tests for Workflow.as_eval_source / _render_workflow_dossier (topology, agents, examples)."""
+
+    def _build_workflow(self, *, with_agent: bool = False) -> Any:
+        from unittest.mock import MagicMock
 
-        mock_result = MagicMock()
-        mock_result.get_outputs.return_value = []
-        item = _build_overall_item("Hello", mock_result)
-        assert item is None
+        from agent_framework._workflows._agent_executor import AgentExecutor
+
+        workflow = MagicMock()
+        workflow.name = "demo-workflow"
+        workflow.description = "Routes user questions through a single agent."
+        workflow.to_dict.return_value = {
+            "name": "demo-workflow",
+            "id": "wf_1",
+            "start_executor_id": "agent_1",
+            "edge_groups": [],
+            "executors": {"agent_1": {"type": "AgentExecutor"}},
+        }
+
+        if with_agent:
+            inner_agent = MagicMock()
+            inner_agent.name = "inner-agent"
+            inner_agent.description = "Inner agent."
+            inner_agent.default_options = {"instructions": "Answer politely.", "tools": []}
+            inner_agent.context_providers = []
+            inner_agent.mcp_tools = []
+
+            executor = MagicMock(spec=AgentExecutor)  # ``spec`` lets the mock pass the renderer's isinstance check
+            executor.agent = inner_agent
+            workflow.executors = {"agent_1": executor}
+        else:
+            workflow.executors = {}
+        return workflow
+
+    def _render(self, workflow: Any, **overrides: Any) -> str:
+        from agent_framework._evaluation import _render_workflow_dossier
+
+        # Same defaults as Workflow.as_eval_source; individual tests override single flags.
+        kwargs: dict[str, Any] = {
+            "include_instructions": True,
+            "include_tools": True,
+            "include_context_providers": False,
+            "include_examples": False,
+            "examples": None,
+            "include_topology": True,
+        }
+        kwargs.update(overrides)
+        return _render_workflow_dossier(workflow, **kwargs)
+
+    def test_emits_dossier_with_topology(self):
+        workflow = self._build_workflow()
+        dossier = self._render(workflow)
+        assert isinstance(dossier, str)
+        assert "Workflow name: demo-workflow" in dossier
+        assert "Topology (JSON):" in dossier
+        assert '"start_executor_id": "agent_1"' in dossier
+
+    def test_topology_can_be_disabled(self):
+        workflow = self._build_workflow()
+        dossier = self._render(workflow, include_topology=False)
+        assert "Topology (JSON):" not in dossier
+
+    def test_per_agent_dossiers_included_when_executor_is_agent_executor(self):
+        workflow = self._build_workflow(with_agent=True)
+        dossier = self._render(workflow)
+        assert "Agents:" in dossier
+        assert "Executor: agent_1" in dossier
+        assert "Agent name: inner-agent" in dossier
+        assert "Answer politely." in dossier
+
+    def test_workflow_examples_excluded_by_default(self):
+        workflow = self._build_workflow()
+        default_dossier = self._render(workflow, examples=["Hi"])
+        assert "Examples:" not in default_dossier
+
+        opt_in_dossier = self._render(workflow, examples=["Hi"], include_examples=True)
+        assert "Examples:" in opt_in_dossier
diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py
index 002e63f8a6..efbe0b8d24 100644
--- a/python/packages/foundry/agent_framework_foundry/__init__.py
+++ b/python/packages/foundry/agent_framework_foundry/__init__.py
@@ -10,10 +10,22 @@
     FoundryEmbeddingSettings,
     RawFoundryEmbeddingClient,
 )
+from ._evals_config import (
+    RubricGenerationSpec,
+    RubricSourceSpec,
+    build_sources,
+    load_evals_config,
+    parse_evals_config,
+)
 from ._foundry_evals import (
+    EvalGenerationSource,
     FoundryEvals,
+    GeneratedEvaluatorRef,
+    RubricDimension,
+    agent_as_eval_source,
     evaluate_foundry_target,
     evaluate_traces,
+    workflow_as_eval_source,
 )
 from ._memory_provider import FoundryMemoryProvider
 
@@ -23,6 +35,7 @@
     __version__ = "0.0.0"
 
 __all__ = [
+    "EvalGenerationSource",
     "FoundryAgent",
     "FoundryAgentOptions",
     "FoundryChatClient",
@@ -32,11 +45,20 @@
     "FoundryEmbeddingSettings",
     "FoundryEvals",
     "FoundryMemoryProvider",
+    "GeneratedEvaluatorRef",
     "RawFoundryAgent",
     "RawFoundryAgentChatClient",
     "RawFoundryChatClient",
     "RawFoundryEmbeddingClient",
+    "RubricDimension",
+    "RubricGenerationSpec",
+    "RubricSourceSpec",
     "__version__",
+    "agent_as_eval_source",
+    "build_sources",
     "evaluate_foundry_target",
     "evaluate_traces",
+    "load_evals_config",
+    "parse_evals_config",
+    "workflow_as_eval_source",
 ]
diff --git a/python/packages/foundry/agent_framework_foundry/_evals_config.py b/python/packages/foundry/agent_framework_foundry/_evals_config.py
new file mode 100644
index 0000000000..5f45e2854b
--- /dev/null
+++ b/python/packages/foundry/agent_framework_foundry/_evals_config.py
@@ -0,0 +1,403 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""YAML-driven evaluator configuration for rubric generation and evaluation.
+
+Defines the source-controlled config schema described in
+``adaptive-evals-draft.md``: a list of named rubric-generation specs that
+CI jobs and harnesses parse to drive
+:meth:`FoundryEvals.generate_rubric`.
+
+Example config:
+
+.. code-block:: yaml
+
+    evaluators:
+      reservation-agent-quality:
+        type: foundry.generated_rubric
+        category: quality
+        model: gpt-4o
+        agent: reservation-agent
+        sources:
+          - type: agent
+            include_instructions: true
+            include_tools: true
+          - type: dataset
+            name: reservation-business-rules
+            version: "1"
+
+Example loader usage:
+
+.. code-block:: python
+
+    from agent_framework_foundry import FoundryEvals, build_sources, load_evals_config
+
+    config = load_evals_config("evaluators.yaml")
+    spec = config["reservation-agent-quality"]
+    sources = build_sources(spec, agent=agent)
+    ref = await FoundryEvals.generate_rubric(
+        project_client=client,
+        name=spec.name,
+        sources=sources,
+        category=spec.category,
+        model=spec.model,
+        display_name=spec.display_name,
+        description=spec.description,
+    )
+"""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Literal, cast
+
+from agent_framework._feature_stage import ExperimentalFeature, experimental
+
+from ._foundry_evals import (
+    EvalGenerationSource,
+    agent_as_eval_source,
+    workflow_as_eval_source,
+)
+
+_RUBRIC_TYPE = "foundry.generated_rubric"
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricSourceSpec:
+    """A single source entry in a :class:`RubricGenerationSpec` ``sources`` list.
+
+    Mirrors the per-source YAML schema.  The :attr:`type` field is the
+    discriminator; only the fields relevant to each type are read.
+
+    Attributes:
+        type: One of ``"agent"``, ``"workflow"``, ``"prompt"``,
+            ``"dataset"``, ``"traces"``.
+        description: Optional description shown in Foundry UI.
+        include_instructions: Whether to include the bound agent /
+            workflow's instructions.  ``"agent"`` / ``"workflow"`` only.
+        include_tools: Whether to include the bound agent / workflow's
+            tools.  ``"agent"`` / ``"workflow"`` only.
+        include_context_providers: Whether to include attached
+            context-provider class names.  ``"agent"`` / ``"workflow"`` only.
+        include_examples: Whether to include ``examples``.
+            ``"agent"`` / ``"workflow"`` only.
+        include_topology: Whether to include the JSON-encoded topology.
+            ``"workflow"`` only.
+        examples: Tuple of example queries for ``"agent"`` /
+            ``"workflow"`` sources.  Empty by default.
+        prompt: Rendered dossier for ``"prompt"`` type;
+            :func:`build_sources` rejects a ``"prompt"`` source without it.
+        agent_name: Hosted Foundry agent name for ``"agent"`` type.  When
+            set, :func:`build_sources` emits a server-side reference and
+            does not consult the local agent or the ``include_*`` flags.
+        name: Dataset name; required by :func:`build_sources` for ``"dataset"`` type.
+        version: Pinned dataset version for ``"dataset"`` type.
+        metadata: Free-form metadata for ``"traces"`` sources.
+    """
+
+    type: Literal["agent", "workflow", "prompt", "dataset", "traces"]
+    description: str | None = None
+    include_instructions: bool = True
+    include_tools: bool = True
+    include_context_providers: bool = False
+    include_examples: bool = False
+    include_topology: bool = True
+    examples: tuple[str, ...] = field(default_factory=tuple)
+    prompt: str | None = None
+    agent_name: str | None = None
+    name: str | None = None
+    version: str | None = None
+    metadata: dict[str, Any] | None = None
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricGenerationSpec:
+    """A single named entry from an evaluators YAML config.
+
+    Attributes:
+        name: Evaluator name (the YAML key under ``evaluators``).
+        type: Discriminator literal.  ``"foundry.generated_rubric"`` is
+            the only value the config parser accepts.
+        category: ``"quality"`` (default) or ``"safety"``.
+        model: Optional model deployment to drive generation.
+        agent: Optional symbolic reference to the agent in the
+            caller's harness.  Resolved by user code into a
+            :class:`BaseAgent` and passed to
+            :func:`build_sources`.
+        workflow: Optional symbolic reference to a workflow; the
+            resolved instance is passed to :func:`build_sources`.
+        display_name: Optional human-readable name.
+        description: Optional description.
+        sources: Tuple of source specs to feed into generation.  When
+            empty, :func:`build_sources` falls back to one default
+            source for the supplied agent, else the supplied workflow.
+    """
+
+    name: str
+    type: str = _RUBRIC_TYPE
+    category: Literal["quality", "safety"] = "quality"
+    model: str | None = None
+    agent: str | None = None
+    workflow: str | None = None
+    display_name: str | None = None
+    description: str | None = None
+    sources: tuple[RubricSourceSpec, ...] = field(default_factory=tuple)
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def load_evals_config(path: str | os.PathLike[str]) -> dict[str, RubricGenerationSpec]:
+    """Load a YAML evaluators config and return a name -> spec mapping.
+
+    Reads ``path`` (UTF-8), parses it with ``yaml.safe_load`` and passes
+    the result to :func:`parse_evals_config`.
+
+    Requires ``PyYAML``, which is imported lazily inside the function.
+
+    Args:
+        path: Filesystem path to the YAML config.
+
+    Returns:
+        A dict mapping evaluator name to :class:`RubricGenerationSpec`.
+
+    Raises:
+        ImportError: If PyYAML is not installed.
+        ValueError: If the parsed document has an invalid structure.
+            File-read and YAML syntax errors are not converted and propagate unchanged.
+    """
+    try:
+        import yaml  # type: ignore[import-untyped]
+    except ImportError as exc:
+        raise ImportError("load_evals_config requires PyYAML.  Install with `pip install pyyaml`.") from exc
+
+    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
+    return parse_evals_config(raw)
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def parse_evals_config(data: Any) -> dict[str, RubricGenerationSpec]:
+    """Parse an already-loaded YAML mapping into rubric-generation specs.
+
+    Useful when callers manage YAML loading themselves (e.g. CI that
+    interpolates env vars before parsing).
+
+    Args:
+        data: A mapping with an ``"evaluators"`` key containing a mapping
+            of evaluator names to spec dicts.
+
+    Returns:
+        A dict of evaluator name to :class:`RubricGenerationSpec`, in config order.
+
+    Raises:
+        ValueError: If ``data`` is not a mapping, ``evaluators`` is missing or not a mapping, or an entry is invalid.
+    """
+    if not isinstance(data, Mapping):
+        raise ValueError("Evaluators config must be a mapping.")
+    data_map = cast("Mapping[str, Any]", data)
+    raw_evaluators = data_map.get("evaluators")
+    if raw_evaluators is None:
+        raise ValueError("Evaluators config is missing a top-level 'evaluators' key.")
+    if not isinstance(raw_evaluators, Mapping):
+        raise ValueError("Evaluators config 'evaluators' entry must be a mapping.")
+    evaluators = cast("Mapping[str, Any]", raw_evaluators)
+
+    parsed: dict[str, RubricGenerationSpec] = {}
+    for name, raw in evaluators.items():
+        if not isinstance(raw, Mapping):
+            raise ValueError(f"Evaluator entry {name!r} must be a mapping, got {type(raw).__name__}.")
+        raw_map = cast("Mapping[str, Any]", raw)
+        parsed[name] = _parse_spec(name, raw_map)
+    return parsed
+
+
+def _parse_spec(name: str, raw: Mapping[str, Any]) -> RubricGenerationSpec:
+    """Validate one evaluator entry and convert it, sources included, into a spec."""
+    kind = raw.get("type", _RUBRIC_TYPE)
+    if kind != _RUBRIC_TYPE:
+        raise ValueError(f"Evaluator {name!r} has unsupported type {kind!r}; expected {_RUBRIC_TYPE!r}.")
+    category = raw.get("category", "quality")
+    if category not in ("quality", "safety"):
+        raise ValueError(f"Evaluator {name!r} has invalid category {category!r}; expected 'quality' or 'safety'.")
+
+    raw_sources: Any = raw.get("sources") or ()
+    if not isinstance(raw_sources, (list, tuple)):
+        raise ValueError(f"Evaluator {name!r} 'sources' must be a list.")
+    parsed_sources: list[RubricSourceSpec] = []
+    for position, entry in enumerate(cast("list[Any]", raw_sources)):
+        if not isinstance(entry, Mapping):
+            raise ValueError(
+                f"Evaluator {name!r} source entry {position} must be a mapping, got {type(entry).__name__}."
+            )
+        parsed_sources.append(_parse_source(name, position, cast("Mapping[str, Any]", entry)))
+
+    return RubricGenerationSpec(
+        name=name,
+        type=kind,
+        category=category,
+        model=raw.get("model"),
+        agent=raw.get("agent"),
+        workflow=raw.get("workflow"),
+        display_name=raw.get("display_name"),
+        description=raw.get("description"),
+        sources=tuple(parsed_sources),
+    )
+
+
+def _parse_source(spec_name: str, index: int, raw: Mapping[str, Any]) -> RubricSourceSpec:
+    """Validate one raw ``sources`` entry and convert it into a :class:`RubricSourceSpec`."""
+    kind = raw.get("type")
+    if kind not in ("agent", "workflow", "prompt", "dataset", "traces"):
+        raise ValueError(
+            f"Evaluator {spec_name!r} source {index} has invalid type {kind!r}; "
+            "expected one of 'agent', 'workflow', 'prompt', 'dataset', 'traces'."
+        )
+
+    raw_examples: Any = raw.get("examples") or ()
+    if not isinstance(raw_examples, (list, tuple)):
+        raise ValueError(f"Evaluator {spec_name!r} source {index} 'examples' must be a list.")
+    raw_metadata = raw.get("metadata")
+    if raw_metadata is not None and not isinstance(raw_metadata, Mapping):
+        raise ValueError(f"Evaluator {spec_name!r} source {index} 'metadata' must be a mapping.")
+    raw_version = raw.get("version")
+
+    # NOTE(review): bool() accepts any value, so a quoted "false" in the config is read as True.
+    return RubricSourceSpec(
+        type=cast("Any", kind),
+        description=raw.get("description"),
+        include_instructions=bool(raw.get("include_instructions", True)),
+        include_tools=bool(raw.get("include_tools", True)),
+        include_context_providers=bool(raw.get("include_context_providers", False)),
+        include_examples=bool(raw.get("include_examples", False)),
+        include_topology=bool(raw.get("include_topology", True)),
+        examples=tuple(str(item) for item in cast("list[Any]", raw_examples)),
+        prompt=raw.get("prompt"),
+        agent_name=raw.get("agent_name"),
+        name=raw.get("name"),
+        version=None if raw_version is None else str(raw_version),
+        metadata=None if raw_metadata is None else dict(cast("Mapping[str, Any]", raw_metadata)),
+    )
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def build_sources(
+    spec: RubricGenerationSpec,
+    *,
+    agent: Any | None = None,
+    workflow: Any | None = None,
+) -> list[EvalGenerationSource]:
+    """Translate a spec's source list into :class:`EvalGenerationSource` instances.
+
+    Resolves each :class:`RubricSourceSpec` against the supplied
+    ``agent`` and ``workflow`` instances:
+
+    * ``type='agent'`` sources call :func:`agent_as_eval_source` with
+      the spec's include-flags.  If the source carries an
+      ``agent_name`` the agent is referenced server-side instead.
+    * ``type='workflow'`` sources call
+      :func:`workflow_as_eval_source` with the spec's include-flags.
+    * ``type='prompt'``, ``type='dataset'``, and ``type='traces'``
+      sources are translated directly, without consulting the runtime
+      agent or workflow.
+
+    A non-empty ``description`` on a source entry is carried onto the
+    generated source for every type; for ``'agent'`` / ``'workflow'``
+    entries it replaces the description the helper would otherwise set.
+
+    When the spec has no ``sources`` entries, defaults to a single agent
+    source when an ``agent`` is provided, else a single workflow source.
+
+    Args:
+        spec: Parsed :class:`RubricGenerationSpec`.
+        agent: Optional agent instance for ``type='agent'`` sources.
+        workflow: Optional workflow instance for ``type='workflow'`` sources.
+
+    Returns:
+        A list of :class:`EvalGenerationSource` ready for :meth:`FoundryEvals.generate_rubric`.
+
+    Raises:
+        ValueError: If a required agent, workflow, or source field is missing.
+    """
+    from dataclasses import replace  # sources are frozen dataclasses: copy, never mutate
+
+    if not spec.sources:
+        if agent is not None:
+            return [agent_as_eval_source(agent)]
+        if workflow is not None:
+            return [workflow_as_eval_source(workflow)]
+        raise ValueError(f"Spec {spec.name!r} has no sources and no agent/workflow was provided to build_sources().")
+
+    out: list[EvalGenerationSource] = []
+    for src in spec.sources:
+        if src.type == "agent":
+            if src.agent_name:
+                out.append(
+                    EvalGenerationSource(
+                        type="agent",
+                        agent_name=src.agent_name,
+                        description=src.description,
+                    )
+                )
+                continue
+            if agent is None:
+                raise ValueError(f"Spec {spec.name!r} has a source of type 'agent' but no agent= was provided.")
+            rendered = agent_as_eval_source(
+                agent,
+                include_instructions=src.include_instructions,
+                include_tools=src.include_tools,
+                include_context_providers=src.include_context_providers,
+                include_examples=src.include_examples,
+                examples=list(src.examples) if src.examples else None,
+            )
+            # An explicit per-source description takes priority over the agent's own.
+            out.append(replace(rendered, description=src.description) if src.description else rendered)
+        elif src.type == "workflow":
+            if workflow is None:
+                raise ValueError(f"Spec {spec.name!r} has a source of type 'workflow' but no workflow= was provided.")
+            rendered = workflow_as_eval_source(
+                workflow,
+                include_instructions=src.include_instructions,
+                include_tools=src.include_tools,
+                include_context_providers=src.include_context_providers,
+                include_examples=src.include_examples,
+                examples=list(src.examples) if src.examples else None,
+                include_topology=src.include_topology,
+            )
+            out.append(replace(rendered, description=src.description) if src.description else rendered)
+        elif src.type == "prompt":
+            if not src.prompt:
+                raise ValueError(f"Spec {spec.name!r} has a 'prompt' source missing the 'prompt' field.")
+            out.append(EvalGenerationSource(type="prompt", prompt=src.prompt, description=src.description))
+        elif src.type == "dataset":
+            if not src.name:
+                raise ValueError(f"Spec {spec.name!r} has a 'dataset' source missing the 'name' field.")
+            out.append(
+                EvalGenerationSource(
+                    type="dataset",
+                    dataset_name=src.name,
+                    dataset_version=src.version,
+                    description=src.description,
+                )
+            )
+        elif src.type == "traces":
+            out.append(
+                EvalGenerationSource(
+                    type="traces",
+                    description=src.description,
+                    metadata=src.metadata,
+                )
+            )
+        else:  # pragma: no cover - guarded by _parse_source
+            raise ValueError(f"Spec {spec.name!r} has unknown source type {src.type!r}.")
+    return out
+
+
+__all__ = [
+    "RubricGenerationSpec",
+    "RubricSourceSpec",
+    "build_sources",
+    "load_evals_config",
+    "parse_evals_config",
+]
diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
index eef58b0a04..2b8d7913e0 100644
--- a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
+++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py
@@ -29,7 +29,8 @@
 import asyncio
 import logging
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from agent_framework._evaluation import (
     AgentEvalConverter,
@@ -39,6 +40,7 @@
     EvalItemResult,
     EvalResults,
     EvalScoreResult,
+    RubricScore,
 )
 from agent_framework._feature_stage import ExperimentalFeature, experimental
 from openai import AsyncOpenAI
@@ -46,11 +48,335 @@
 from ._chat_client import FoundryChatClient
 
 if TYPE_CHECKING:
+    from agent_framework._agents import BaseAgent
+    from agent_framework._workflows._workflow import Workflow
     from azure.ai.projects.aio import AIProjectClient
     from openai.types.evals import RunRetrieveResponse
 
 logger = logging.getLogger(__name__)
 
+
+# region Generated rubric evaluator types
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class RubricDimension:
+    """A single dimension of a generated rubric evaluator.
+
+    Rubric evaluators score each item along one or more named dimensions,
+    each with its own description and weight.  Foundry's evaluator
+    generation pipeline produces these dimensions from agent/workflow
+    metadata; ``RubricDimension`` surfaces them so callers can inspect a
+    generated evaluator's structure without round-tripping through the
+    portal.  Instances are frozen dataclasses.
+
+    Attributes:
+        id: Stable identifier for the dimension (e.g. ``"policy_enforcement"``).
+        description: Natural-language description of what the dimension scores.
+        weight: Integer weight controlling the dimension's contribution to
+            the aggregate score.
+        always_applicable: When ``False`` (the default), evaluators may
+            mark this dimension non-applicable on a per-item basis.
+    """
+
+    id: str
+    description: str
+    weight: int
+    always_applicable: bool = False
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class GeneratedEvaluatorRef:
+    """A reference to a generated rubric evaluator stored in Foundry.
+
+    Pass instances of this class to :class:`FoundryEvals` to score items
+    with a previously generated rubric evaluator.  Construct directly
+    when the evaluator already exists, or obtain one from
+    :meth:`FoundryEvals.generate_rubric`.
+
+    Pinning ``version`` is strongly recommended so evaluation runs are
+    reproducible.  The dataclass accepts ``version=None`` for the
+    convenience of :meth:`latest`, but a warning is logged whenever a
+    versionless reference is turned into testing criteria; CI gates
+    should always pass a concrete version.
+
+    Attributes:
+        name: Evaluator name as stored in the Foundry project (e.g.
+            ``"my-policy-evaluator"``).  Distinct from built-in
+            evaluators such as ``"builtin.relevance"``.
+        version: Pinned evaluator version.  ``None`` means "latest" —
+            this is discouraged for CI/repro and a warning is logged
+            when such a reference is used.
+        category: ``"quality"`` for ungrounded rubric scoring,
+            ``"safety"`` for safety-focused evaluators.  Matches the
+            Foundry evaluator's declared category.
+        display_name: Optional human-readable name used in result
+            summaries.  Defaults to ``name`` when unset.
+        description: Optional description carried over from the
+            generated evaluator definition for documentation.
+        dimensions: Optional snapshot of the rubric's dimensions for
+            inspection.  Not required to invoke the evaluator — the
+            service uses the persisted definition.
+        pass_threshold: Optional aggregate score threshold (0.0-1.0) the
+            evaluator considers a passing item.  ``None`` defers to the
+            evaluator's stored default.
+    """
+
+    name: str
+    version: str | None = None
+    category: Literal["quality", "safety"] = "quality"
+    display_name: str | None = None
+    description: str | None = None
+    dimensions: tuple[RubricDimension, ...] | None = None
+    pass_threshold: float | None = None
+
+    @classmethod
+    def latest(
+        cls,
+        name: str,
+        *,
+        category: Literal["quality", "safety"] = "quality",
+        display_name: str | None = None,
+        description: str | None = None,
+    ) -> GeneratedEvaluatorRef:
+        """Construct a versionless reference (resolves to the latest version at run time).
+
+        ``dimensions`` and ``pass_threshold`` are left at ``None``.  Discouraged
+        for reproducible runs: prefer the constructor with an explicit
+        ``version`` so CI and replay evaluations stay stable on regeneration.
+        """
+        return cls(
+            name=name,
+            version=None,
+            category=category,
+            display_name=display_name,
+            description=description,
+        )
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+@dataclass(frozen=True)
+class EvalGenerationSource:
+    """A source description passed to Foundry's evaluator generation pipeline.
+
+    Rubric evaluator generation consumes one or more sources that describe
+    the agent or workflow under evaluation.  ``FoundryEvals`` translates
+    instances into the underlying ``*EvaluatorGenerationJobSource`` SDK
+    types.
+
+    Discriminated by :attr:`type`:
+
+    * ``"prompt"`` - a free-form textual dossier (typical for local agents
+      and workflows whose tools cannot be fetched server-side).
+    * ``"agent"`` - a hosted Foundry agent referenced by name so the
+      service fetches tool definitions and metadata directly.
+    * ``"dataset"`` - a Foundry dataset of recorded interactions.
+    * ``"traces"`` - tracing data scoped by metadata.
+
+    Only the fields relevant to :attr:`type` are expected to be set; the
+    rest stay ``None``.  The dataclass does not validate the combination.
+
+    Attributes:
+        type: Source kind.  See discriminator above.
+        description: Optional short description shown in Foundry UI.
+        prompt: Rendered dossier for ``type="prompt"`` sources.
+        agent_name: Hosted Foundry agent name for ``type="agent"`` sources.
+        agent_version: Optional pinned hosted-agent version for
+            ``type="agent"`` sources.  ``None`` resolves to the latest
+            version at generation time; pin for reproducible runs.
+        dataset_name: Foundry dataset name for ``type="dataset"`` sources.
+        dataset_version: Pinned dataset version (recommended for repro).
+        metadata: Free-form metadata.  Used by ``type="traces"`` sources
+            for tracing-attribute filters and as a generic escape hatch
+            for additional fields not yet modeled.
+    """
+
+    type: Literal["prompt", "dataset", "agent", "traces"]
+    description: str | None = None
+    prompt: str | None = None
+    agent_name: str | None = None
+    agent_version: str | None = None
+    dataset_name: str | None = None
+    dataset_version: str | None = None
+    metadata: dict[str, Any] | None = None
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def agent_as_eval_source(
+    agent: BaseAgent,
+    *,
+    include_instructions: bool = True,
+    include_tools: bool = True,
+    include_context_providers: bool = False,
+    include_examples: bool = False,
+    examples: Sequence[str] | None = None,
+    hosted_agent_name: str | None = None,
+    hosted_agent_version: str | None = None,
+    force_prompt_source: bool = False,
+) -> EvalGenerationSource:
+    """Describe ``agent`` as an :class:`EvalGenerationSource` for rubric generation.
+
+    Two source variants can be produced:
+
+    * **Hosted Foundry agents** (``FoundryAgent`` connected to a Prompt
+      Agent or Hosted Agent in a Foundry project) become ``type="agent"``
+      sources keyed by ``agent_name``, so the service fetches
+      instructions, tools, and metadata from the agent registry rather
+      than from the local wrapper.  They are detected automatically
+      from ``agent.chat_client.agent_name`` and
+      ``agent.chat_client.agent_version``.
+    * **Local agents** (any other ``BaseAgent`` whose instructions and
+      tools live client-side, e.g. ``FoundryChatClient``-backed agents or
+      pure OpenAI Responses agents) become ``type="prompt"`` sources
+      carrying the text dossier from ``agent.as_eval_source()``.
+
+    The detection can be overridden: ``hosted_agent_name`` forces an
+    ``"agent"`` source, and ``force_prompt_source=True`` forces a
+    ``"prompt"`` source (useful when the service should score a hosted
+    agent against the *local* wrapper's overrides).  When both are
+    given, ``force_prompt_source`` takes precedence.  In every case the
+    source's ``description`` is the agent's ``description`` attribute.
+
+    Args:
+        agent: Agent instance (typically a ``BaseAgent`` subclass).
+        include_instructions: Whether the dossier contains the agent's
+            instructions text (``"prompt"`` sources only).  Defaults to
+            ``True``.
+        include_tools: Whether the dossier contains tool definitions
+            (``"prompt"`` sources only).  Defaults to ``True``.
+        include_context_providers: Whether the dossier names the
+            attached context-provider classes (``"prompt"`` sources
+            only).  Defaults to ``False`` to avoid leaking
+            implementation details.
+        include_examples: Whether the dossier contains the supplied
+            ``examples`` (``"prompt"`` sources only).  Defaults to
+            ``False`` to avoid shipping potentially sensitive sample
+            inputs by default.
+        examples: Optional sample queries / interactions to include when
+            ``include_examples`` is ``True``.
+        hosted_agent_name: When set, emit a ``type="agent"`` source
+            referencing this hosted Foundry agent name instead of
+            running auto-detection.
+        hosted_agent_version: Pins a hosted-agent source to a specific
+            version.  When ``None`` and the agent was auto-detected, the
+            detected version (if any) is used.  Recommended for
+            reproducible rubric generation against PromptAgents.
+        force_prompt_source: When ``True``, always emit a
+            ``type="prompt"`` source with the rendered dossier even when
+            the agent is a hosted Foundry agent.  Useful when the local
+            wrapper holds overrides the service-side agent doesn't see.
+
+    Returns:
+        An :class:`EvalGenerationSource` describing the agent.
+    """
+    agent_description = getattr(agent, "description", None)
+
+    if not force_prompt_source:
+        name, version = hosted_agent_name, hosted_agent_version
+        if name is None:
+            # No explicit override: fall back to what the agent's chat client reports.
+            name, detected_version = _detect_hosted_foundry_agent(agent)
+            if version is None:
+                version = detected_version
+        if name is not None:
+            return EvalGenerationSource(
+                type="agent",
+                agent_name=name,
+                agent_version=version,
+                description=agent_description,
+            )
+
+    # Local agent, or the caller forced a dossier: render the agent client-side.
+    dossier = agent.as_eval_source(
+        include_instructions=include_instructions,
+        include_tools=include_tools,
+        include_context_providers=include_context_providers,
+        include_examples=include_examples,
+        examples=examples,
+    )
+    return EvalGenerationSource(
+        type="prompt",
+        prompt=dossier,
+        description=agent_description,
+    )
+
+
+def _detect_hosted_foundry_agent(agent: BaseAgent) -> tuple[str | None, str | None]:
+    """Extract the hosted-agent identity from ``agent.chat_client``, if there is one.
+
+    An agent counts as hosted when its ``chat_client`` carries a non-empty
+    string ``agent_name`` (the ``RawFoundryAgentChatClient`` convention);
+    non-string values are ignored so ``MagicMock`` doubles are not mis-detected.
+
+    Returns ``(name, version)``, where ``version`` is ``None`` unless it is a
+    non-empty string, or ``(None, None)`` when the agent is not hosted.
+    """
+    client = getattr(agent, "chat_client", None)
+    if client is None:
+        return None, None
+    raw_name = getattr(client, "agent_name", None)
+    raw_version = getattr(client, "agent_version", None)
+    if not (isinstance(raw_name, str) and raw_name):
+        return None, None
+    # A missing, empty, or non-string version leaves the source unpinned.
+    pinned = raw_version if isinstance(raw_version, str) and raw_version else None
+    return raw_name, pinned
+
+
+@experimental(feature_id=ExperimentalFeature.EVALS)
+def workflow_as_eval_source(
+    workflow: Workflow,
+    *,
+    include_instructions: bool = True,
+    include_tools: bool = True,
+    include_context_providers: bool = False,
+    include_examples: bool = False,
+    examples: Sequence[str] | None = None,
+    include_topology: bool = True,
+) -> EvalGenerationSource:
+    """Describe ``workflow`` as a ``type="prompt"`` :class:`EvalGenerationSource`.
+
+    Delegates rendering to :meth:`Workflow.as_eval_source` and wraps the
+    resulting dossier (workflow name, description, topology, per-agent
+    dossiers) together with ``workflow.description``.
+
+    Args:
+        workflow: Workflow instance to render.
+        include_instructions: Per-agent instructions inclusion.
+            Defaults to ``True``.
+        include_tools: Per-agent tools inclusion.  Defaults to ``True``.
+        include_context_providers: Per-agent context-provider inclusion.
+            Defaults to ``False``.
+        include_examples: Per-agent examples inclusion.  Defaults to
+            ``False``.
+        examples: Optional workflow-level sample queries.  Rendered into
+            a top-level ``Examples:`` section when ``include_examples`` is
+            ``True``.
+        include_topology: Whether to embed the JSON-encoded workflow
+            topology.  Defaults to ``True``.
+
+    Returns:
+        A ``type="prompt"`` :class:`EvalGenerationSource` describing the
+        workflow.
+    """
+    # Workflows are always rendered client-side, so the source type is fixed to "prompt".
+    return EvalGenerationSource(
+        type="prompt",
+        prompt=workflow.as_eval_source(
+            include_instructions=include_instructions,
+            include_tools=include_tools,
+            include_context_providers=include_context_providers,
+            include_examples=include_examples,
+            examples=examples,
+            include_topology=include_topology,
+        ),
+        description=workflow.description,
+    )
+
+
+# endregion
 # Agent evaluators that accept query/response as conversation arrays.
 # Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
 # for the latest evaluator list. These are the evaluators that need conversation-format input.
@@ -166,7 +492,7 @@ def _resolve_evaluator(name: str) -> str:
 
 
 def _build_testing_criteria(
-    evaluators: Sequence[str],
+    evaluators: Sequence[str | GeneratedEvaluatorRef],
     model: str,
     *,
     include_data_mapping: bool = False,
@@ -175,7 +501,9 @@ def _build_testing_criteria(
     """Build ``testing_criteria`` for ``evals.create()``.
 
     Args:
-        evaluators: Evaluator names.
+        evaluators: Evaluator names (built-in shorts / fully-qualified
+            ``builtin.*`` names) or :class:`GeneratedEvaluatorRef`
+            instances for generated rubric evaluators.
         model: Model deployment for the LLM judge.
         include_data_mapping: Whether to include field-level data mapping
             (required for the JSONL data source, not needed for response-based).
@@ -183,7 +511,38 @@ def _build_testing_criteria(
             definitions.
     """
     criteria: list[dict[str, Any]] = []
-    for name in evaluators:
+    for entry_spec in evaluators:
+        if isinstance(entry_spec, GeneratedEvaluatorRef):
+            short = entry_spec.display_name or entry_spec.name
+            ref_entry: dict[str, Any] = {
+                "type": "azure_ai_evaluator",
+                "name": short,
+                "evaluator_name": entry_spec.name,
+                "initialization_parameters": {"deployment_name": model},
+            }
+            if entry_spec.version is not None:
+                ref_entry["evaluator_version"] = entry_spec.version
+            else:
+                logger.warning(
+                    "GeneratedEvaluatorRef '%s' has no pinned version; the eval run "
+                    "will resolve to whichever version is current at execution time. "
+                    "Pin the version for reproducible runs.",
+                    entry_spec.name,
+                )
+            if include_data_mapping:
+                # Rubric evaluators accept conversation arrays like agent
+                # evaluators, plus tool_definitions when items are tool-aware.
+                ref_mapping: dict[str, str] = {
+                    "query": "{{item.query_messages}}",
+                    "response": "{{item.response_messages}}",
+                }
+                if include_tool_definitions:
+                    ref_mapping["tool_definitions"] = "{{item.tool_definitions}}"
+                ref_entry["data_mapping"] = ref_mapping
+            criteria.append(ref_entry)
+            continue
+
+        name = entry_spec
         qualified = _resolve_evaluator(name)
         short = name if not name.startswith("builtin.") else name.split(".")[-1]
 
@@ -247,9 +606,9 @@ def _build_item_schema(
 
 
 def _resolve_default_evaluators(
-    evaluators: Sequence[str] | None,
+    evaluators: Sequence[str | GeneratedEvaluatorRef] | None,
     items: Sequence[EvalItem | dict[str, Any]] | None = None,
-) -> list[str]:
+) -> list[str | GeneratedEvaluatorRef]:
     """Resolve evaluators, applying defaults when ``None``.
 
     Defaults to relevance + coherence + task_adherence. Automatically adds
@@ -258,7 +617,7 @@ def _resolve_default_evaluators(
     if evaluators is not None:
         return list(evaluators)
 
-    result = list(_DEFAULT_EVALUATORS)
+    result: list[str | GeneratedEvaluatorRef] = list(_DEFAULT_EVALUATORS)
     if items is not None:
         has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items)
         if has_tools:
@@ -267,14 +626,24 @@ def _resolve_default_evaluators(
 
 
 def _filter_tool_evaluators(
-    evaluators: list[str],
+    evaluators: list[str | GeneratedEvaluatorRef],
     items: Sequence[EvalItem | dict[str, Any]],
-) -> list[str]:
-    """Remove tool evaluators if no items have tool definitions."""
+) -> list[str | GeneratedEvaluatorRef]:
+    """Remove tool evaluators if no items have tool definitions.
+
+    Generated rubric evaluators are tool-aware but not tool-required; they
+    are preserved regardless of whether items carry tool definitions.
+    """
     has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items)
     if has_tools:
         return evaluators
-    filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS]
+
+    def _is_tool_only(spec: str | GeneratedEvaluatorRef) -> bool:
+        if isinstance(spec, GeneratedEvaluatorRef):
+            return False
+        return _resolve_evaluator(spec) in _TOOL_EVALUATORS
+
+    filtered = [e for e in evaluators if not _is_tool_only(e)]
     if not filtered:
         raise ValueError(
             f"All requested evaluators {evaluators} require tool definitions, "
@@ -282,7 +651,7 @@ def _filter_tool_evaluators(
             "or choose evaluators that do not require tools."
         )
     if len(filtered) < len(evaluators):
-        removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS]
+        removed = [e for e in evaluators if _is_tool_only(e)]
         logger.info("Removed tool evaluators %s (no items have tools)", removed)
     return filtered
 
@@ -354,6 +723,79 @@ def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int]
     return per_eval
 
 
+def _extract_rubric_scores(sample: Any) -> list[RubricScore] | None:
+    """Extract typed ``RubricScore`` instances from an evaluator's raw sample payload.
+
+    Foundry rubric evaluators include a per-dimension breakdown under
+    ``properties.rubric_scores`` on each result.  The exact location may
+    vary across SDK versions, so this helper accepts a few shapes:
+
+    * The SDK ``sample`` object exposes ``properties.rubric_scores``.
+    * The ``sample`` is a dict containing ``properties.rubric_scores``.
+    * The ``sample`` is a dict with ``rubric_scores`` at the top level.
+
+    Entries missing ``id``, ``weight`` or ``applicable``, or holding values
+    that cannot be converted, are skipped instead of failing the extraction.
+
+    Returns ``None`` when no usable rubric scores are present (e.g. the
+    evaluator was not a rubric evaluator).
+    """
+    if sample is None:
+        return None
+
+    raw: Any = None
+    properties: Any = getattr(sample, "properties", None)
+    if properties is not None:
+        raw = getattr(properties, "rubric_scores", None)
+        if raw is None and isinstance(properties, dict):
+            raw = cast("dict[str, Any]", properties).get("rubric_scores")
+    if raw is None and isinstance(sample, dict):
+        sample_any = cast("dict[str, Any]", sample)
+        props_dict: Any = sample_any.get("properties")
+        if isinstance(props_dict, dict):
+            raw = cast("dict[str, Any]", props_dict).get("rubric_scores")
+        if raw is None:
+            raw = sample_any.get("rubric_scores")
+
+    if not raw:
+        return None
+
+    try:
+        entries: Any = iter(raw)
+    except TypeError:
+        return None  # a truthy non-iterable payload is not a score list
+    parsed: list[RubricScore] = []
+    for entry in entries:
+        try:
+            if isinstance(entry, dict):
+                entry_any = cast("dict[str, Any]", entry)
+                rid = entry_any.get("id")
+                score_val = entry_any.get("score")
+                applicable = entry_any.get("applicable")
+                weight = entry_any.get("weight")
+                reason = entry_any.get("reason", "")
+            else:
+                rid = getattr(entry, "id", None)
+                score_val = getattr(entry, "score", None)
+                applicable = getattr(entry, "applicable", None)
+                weight = getattr(entry, "weight", None)
+                reason = getattr(entry, "reason", "") or ""
+            if rid is None or weight is None or applicable is None:
+                continue
+            parsed.append(
+                RubricScore(
+                    id=str(rid),
+                    score=int(score_val) if isinstance(score_val, (int, float)) else None,
+                    applicable=bool(applicable),
+                    weight=int(weight),
+                    reason=str(reason) if reason is not None else "",
+                )
+            )
+        except (TypeError, ValueError, OverflowError):  # OverflowError: int() of an infinite float
+            logger.debug("Skipping malformed rubric_scores entry: %s", cast("Any", entry), exc_info=True)
+    return parsed or None
+
+
 async def _fetch_output_items(
     client: AsyncOpenAI,
     eval_id: str,
@@ -377,12 +819,15 @@ async def _fetch_output_items(
             # Extract per-evaluator scores
             scores: list[EvalScoreResult] = []
             for r in oi.results or []:
+                sample = r.sample
+                dimensions = _extract_rubric_scores(sample)
                 scores.append(
                     EvalScoreResult(
                         name=r.name,
                         score=r.score,
                         passed=r.passed,
-                        sample=r.sample,
+                        sample=sample,
+                        dimensions=dimensions,
                     )
                 )
 
@@ -472,7 +917,7 @@ async def _evaluate_via_responses_impl(
     *,
     client: AsyncOpenAI,
     response_ids: Sequence[str],
-    evaluators: list[str],
+    evaluators: list[str | GeneratedEvaluatorRef],
     model: str,
     eval_name: str,
     poll_interval: float,
@@ -573,8 +1018,11 @@ class FoundryEvals:
             (from ``azure.ai.projects.aio``).  Provide this or *client*.
         model: Model deployment name for the evaluator LLM judge.
             Resolved from ``client.model`` when omitted.
-        evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``).
-            When ``None`` (default), uses smart defaults based on item data.
+        evaluators: Evaluator specifications.  Entries may be built-in
+            short names (e.g. ``"relevance"``), fully-qualified
+            ``"builtin.*"`` names, or :class:`GeneratedEvaluatorRef`
+            instances for previously generated rubric evaluators.  When
+            ``None`` (default), uses smart defaults based on item data.
         conversation_split: How to split multi-turn conversations into
             query/response halves.  Defaults to ``LAST_TURN``.  Pass a
             ``ConversationSplit`` enum value or a custom callable — see
@@ -623,7 +1071,7 @@ def __init__(
         client: FoundryChatClient | None = None,
         project_client: AIProjectClient | None = None,
         model: str | None = None,
-        evaluators: Sequence[str] | None = None,
+        evaluators: Sequence[str | GeneratedEvaluatorRef] | None = None,
         conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN,
         poll_interval: float = 5.0,
         timeout: float = 180.0,
@@ -642,7 +1090,9 @@ def __init__(
                 "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured."
             )
         self._model = resolved_model
-        self._evaluators = list(evaluators) if evaluators is not None else None
+        self._evaluators: list[str | GeneratedEvaluatorRef] | None = (
+            list(evaluators) if evaluators is not None else None
+        )
         self._conversation_split = conversation_split
         self._poll_interval = poll_interval
         self._timeout = timeout
@@ -678,7 +1128,7 @@ async def evaluate(
     async def _evaluate_via_dataset(
         self,
         items: Sequence[EvalItem],
-        evaluators: list[str],
+        evaluators: list[str | GeneratedEvaluatorRef],
         eval_name: str,
     ) -> EvalResults:
         """Evaluate using JSONL dataset upload path."""
@@ -752,6 +1202,334 @@ async def _evaluate_via_dataset(
             provider=self.name,
         )
 
+    @classmethod
+    @experimental(feature_id=ExperimentalFeature.EVALS)
+    async def generate_rubric(
+        cls,
+        *,
+        project_client: AIProjectClient,
+        name: str,
+        agent: BaseAgent | None = None,
+        workflow: Workflow | None = None,
+        sources: Sequence[EvalGenerationSource] | None = None,
+        category: Literal["quality", "safety"] = "quality",
+        model: str | None = None,
+        display_name: str | None = None,
+        description: str | None = None,
+        operation_id: str | None = None,
+        poll_interval: float = 5.0,
+        timeout: float = 600.0,
+    ) -> GeneratedEvaluatorRef:
+        """Generate a Foundry rubric evaluator from an agent or workflow.
+
+        Drives the Foundry evaluator-generation long-running operation
+        (``client.beta.evaluators.create_generation_job``) end-to-end and
+        returns a pinned :class:`GeneratedEvaluatorRef` for use with
+        :class:`FoundryEvals` ``evaluators=`` lists.
+
+        Exactly one of ``agent``, ``workflow``, or ``sources`` must be
+        supplied.  When ``agent`` or ``workflow`` is given,
+        :func:`agent_as_eval_source` / :func:`workflow_as_eval_source` is
+        used to build a single conservative source (instructions and
+        tools included; examples and context providers excluded).  Pass
+        ``sources=`` directly to control inclusion explicitly or to
+        provide multiple sources.
+
+        Requires ``azure-ai-projects`` with the rubric-generation APIs
+        (currently ``2.3.0a*`` on the Azure SDK dev feed; tracked for an
+        upcoming PyPI release).  Raises :class:`NotImplementedError` with
+        a clear message when the dependency is unavailable.
+
+        Keyword Args:
+            project_client: Async ``AIProjectClient`` for the target
+                Foundry project.
+            name: Evaluator name to register in the project.  Must be a
+                stable identifier (e.g. ``"policy-enforcement-v1"``).
+            agent: Optional ``BaseAgent`` to derive a source from.
+            workflow: Optional ``Workflow`` to derive a source from.
+            sources: Explicit list of :class:`EvalGenerationSource`
+                instances.  Mutually exclusive with ``agent`` / ``workflow``.
+            category: ``"quality"`` or ``"safety"``.  Defaults to
+                ``"quality"``.
+            model: Optional model deployment to drive generation.  When
+                omitted the service picks a default.
+            display_name: Optional human-readable name for the evaluator.
+            description: Optional description for the evaluator.
+            operation_id: Optional caller-supplied operation id to make
+                the create call idempotent.
+            poll_interval: Seconds between job-status polls.
+            timeout: Maximum seconds to wait for the job to complete.
+
+        Returns:
+            A pinned :class:`GeneratedEvaluatorRef` referring to the
+            newly created evaluator.
+
+        Raises:
+            ValueError: If the source arguments are inconsistent or ``category`` is invalid.
+            NotImplementedError: If the installed ``azure-ai-projects``
+                version does not expose the rubric APIs.
+            TimeoutError: If the job does not complete within ``timeout``.
+            RuntimeError: If the job returns no id, ends in a non-succeeded
+                terminal state, or completes without a usable evaluator artifact.
+        """
+        resolved_sources = _coalesce_generation_sources(agent=agent, workflow=workflow, sources=sources)
+
+        if category not in ("quality", "safety"):
+            raise ValueError(f"category must be 'quality' or 'safety', got {category!r}.")
+
+        try:
+            sdk_types = _import_generation_sdk_types()  # SDK model classes are resolved at call time
+        except _RubricSdkUnavailableError as exc:
+            raise NotImplementedError(str(exc)) from exc
+
+        sdk_sources = [_to_sdk_source(s, sdk_types) for s in resolved_sources]
+
+        inputs_kwargs: dict[str, Any] = {  # optional fields are added below only when provided
+            "name": name,
+            "category": category,
+            "sources": sdk_sources,
+        }
+        if model is not None:
+            inputs_kwargs["model"] = model
+        if display_name is not None:
+            inputs_kwargs["display_name"] = display_name
+        if description is not None:
+            inputs_kwargs["description"] = description
+
+        inputs = sdk_types.EvaluatorGenerationInputs(**inputs_kwargs)
+        job = sdk_types.EvaluatorGenerationJob(inputs=inputs)
+
+        create_kwargs: dict[str, Any] = {"job": job}  # operation_id is forwarded only when supplied
+        if operation_id is not None:
+            create_kwargs["operation_id"] = operation_id
+
+        evaluators_ops = _get_beta_evaluators(project_client)
+        created = await evaluators_ops.create_generation_job(**create_kwargs)
+        completed = await _poll_generation_job(  # returns only once the job has succeeded
+            evaluators_ops,
+            created,
+            poll_interval=poll_interval,
+            timeout=timeout,
+        )
+
+        return _generation_job_to_ref(completed, category=category)
+
+
+_TERMINAL_GENERATION_STATUSES: frozenset[str] = frozenset({"succeeded", "failed", "cancelled", "canceled"})
+
+
+class _RubricSdkUnavailableError(Exception):
+    """Internal signal that azure-ai-projects is missing or lacks the rubric-generation model classes."""
+
+
+@dataclass(frozen=True)
+class _GenerationSdkTypes:
+    """Resolved SDK type handles for rubric-evaluator generation (filled by ``_import_generation_sdk_types``)."""
+
+    EvaluatorGenerationInputs: Any  # required SDK class of the same name
+    EvaluatorGenerationJob: Any  # required SDK class of the same name
+    PromptSource: Any  # required; SDK class PromptEvaluatorGenerationJobSource
+    AgentSource: Any | None  # None when the SDK lacks AgentEvaluatorGenerationJobSource
+    DatasetSource: Any | None  # None when the SDK lacks DatasetEvaluatorGenerationJobSource
+    TracesSource: Any | None  # None when the SDK lacks TracesEvaluatorGenerationJobSource
+
+
+_RUBRIC_SDK_MISSING_MSG = (
+    "FoundryEvals.generate_rubric requires the rubric-evaluator generation APIs "
+    "from azure-ai-projects (currently 2.3.0a* on the Azure SDK Python dev feed). "
+    "Install a build that exposes "
+    "`azure.ai.projects.models.EvaluatorGenerationInputs` and "
+    "`AIProjectClient.beta.evaluators.create_generation_job`."
+)
+
+
+def _import_generation_sdk_types() -> _GenerationSdkTypes:
+    """Look up the rubric-generation model classes in ``azure.ai.projects.models``.
+
+    Raises ``_RubricSdkUnavailableError`` when the import fails or a mandatory class (inputs,
+    job, prompt source) is missing; the optional source classes resolve to ``None`` instead.
+    """
+    try:
+        from azure.ai.projects import models as _models  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG) from exc
+
+    sdk_models: Any = _models
+    mandatory: list[Any] = [
+        getattr(sdk_models, cls_name, None)
+        for cls_name in ("EvaluatorGenerationInputs", "EvaluatorGenerationJob", "PromptEvaluatorGenerationJobSource")
+    ]
+    if any(handle is None for handle in mandatory):
+        raise _RubricSdkUnavailableError(_RUBRIC_SDK_MISSING_MSG)
+    return _GenerationSdkTypes(
+        EvaluatorGenerationInputs=mandatory[0],
+        EvaluatorGenerationJob=mandatory[1],
+        PromptSource=mandatory[2],
+        AgentSource=getattr(sdk_models, "AgentEvaluatorGenerationJobSource", None),
+        DatasetSource=getattr(sdk_models, "DatasetEvaluatorGenerationJobSource", None),
+        TracesSource=getattr(sdk_models, "TracesEvaluatorGenerationJobSource", None),
+    )
+
+
+def _get_beta_evaluators(project_client: AIProjectClient) -> Any:
+    """Return ``project_client.beta.evaluators``, raising ``NotImplementedError`` when it is absent."""
+    # getattr on a missing (None) ``beta`` namespace also yields None, so one check covers both cases.
+    ops = getattr(getattr(project_client, "beta", None), "evaluators", None)
+    if ops is not None:
+        return ops
+    raise NotImplementedError(_RUBRIC_SDK_MISSING_MSG)
+
+
+def _coalesce_generation_sources(
+    *,
+    agent: BaseAgent | None,
+    workflow: Workflow | None,
+    sources: Sequence[EvalGenerationSource] | None,
+) -> list[EvalGenerationSource]:
+    """Turn the one supplied input into a source list; ``ValueError`` if none, several, or empty ``sources``."""
+    if sources is not None and not sources:
+        raise ValueError("sources= must contain at least one EvalGenerationSource.")
+    # "Supplied" means "not None": truthiness would drop an agent/workflow whose __bool__/__len__ is falsy.
+    supplied = sum(arg is not None for arg in (agent, workflow, sources))
+    if supplied == 0:
+        raise ValueError("Provide one of agent=, workflow=, or sources=.")
+    if supplied > 1:
+        raise ValueError("Provide only one of agent=, workflow=, or sources=.")
+    if sources is not None:
+        return list(sources)
+    if agent is not None:
+        return [agent_as_eval_source(agent)]
+    return [workflow_as_eval_source(cast("Workflow", workflow))]
+
+
+def _to_sdk_source(source: EvalGenerationSource, sdk_types: _GenerationSdkTypes) -> Any:
+    """Build the SDK source model matching ``source.type``.
+
+    Raises:
+        ValueError: If a field required by the source type is empty, or the type is unknown.
+        NotImplementedError: If the installed SDK lacks the class for that source type.
+    """
+    kind = source.type
+    fields: dict[str, Any]
+    if kind == "prompt":
+        if not source.prompt:
+            raise ValueError("EvalGenerationSource(type='prompt') requires a non-empty prompt.")
+        factory, fields = sdk_types.PromptSource, {"prompt": source.prompt}
+    elif kind == "agent":
+        if sdk_types.AgentSource is None:
+            raise NotImplementedError("Installed azure-ai-projects does not expose AgentEvaluatorGenerationJobSource.")
+        if not source.agent_name:
+            raise ValueError("EvalGenerationSource(type='agent') requires agent_name.")
+        factory, fields = sdk_types.AgentSource, {"agent_name": source.agent_name}
+        if source.agent_version is not None:
+            fields["agent_version"] = source.agent_version
+    elif kind == "dataset":
+        if sdk_types.DatasetSource is None:
+            raise NotImplementedError(
+                "Installed azure-ai-projects does not expose DatasetEvaluatorGenerationJobSource."
+            )
+        if not source.dataset_name:
+            raise ValueError("EvalGenerationSource(type='dataset') requires dataset_name.")
+        # The SDK model takes ``name`` / ``version`` rather than ``dataset_name`` / ``dataset_version``.
+        factory, fields = sdk_types.DatasetSource, {"name": source.dataset_name}
+        if source.dataset_version is not None:
+            fields["version"] = source.dataset_version
+    elif kind == "traces":
+        if sdk_types.TracesSource is None:
+            raise NotImplementedError("Installed azure-ai-projects does not expose TracesEvaluatorGenerationJobSource.")
+        factory, fields = sdk_types.TracesSource, {}
+        if source.metadata is not None:
+            fields["metadata"] = source.metadata
+    else:
+        raise ValueError(f"Unknown EvalGenerationSource type: {source.type!r}")
+    # ``description`` is optional on every source type and is always the last keyword added.
+    if source.description is not None:
+        fields["description"] = source.description
+    return factory(**fields)
+
+
+async def _poll_generation_job(
+    evaluators_ops: Any,
+    job: Any,
+    *,
+    poll_interval: float,
+    timeout: float,
+) -> Any:
+    """Re-fetch a rubric-generation job until its status is terminal, then return the final job.
+
+    Raises ``RuntimeError`` (missing job id, non-succeeded end state) or ``TimeoutError`` (deadline hit).
+    """
+    job_id = getattr(job, "id", None)
+    if not job_id:
+        raise RuntimeError("Rubric generation job did not return an id.")
+    deadline = asyncio.get_running_loop().time() + timeout
+    while True:
+        status = (getattr(job, "status", "") or "").lower()
+        if status in _TERMINAL_GENERATION_STATUSES:
+            if status == "succeeded":
+                return job
+            failure = getattr(job, "error", None)
+            detail = status if failure is None else (getattr(failure, "message", None) or str(failure))
+            raise RuntimeError(f"Rubric generation job {job_id} ended in status {status!r}: {detail}")
+        time_left = deadline - asyncio.get_running_loop().time()
+        if time_left <= 0:
+            raise TimeoutError(
+                f"Rubric generation job {job_id} did not complete within {timeout}s (last status: {status!r})."
+            )
+        await asyncio.sleep(min(poll_interval, time_left))
+        job = await evaluators_ops.get_generation_job(job_id)
+
+
+def _generation_job_to_ref(job: Any, *, category: Literal["quality", "safety"]) -> GeneratedEvaluatorRef:
+    """Convert a completed generation job into a version-pinned :class:`GeneratedEvaluatorRef`.
+
+    ``category`` is passed through unchanged; every other field is read from ``job.artifacts.evaluator``.
+
+    Raises:
+        RuntimeError: If the job has no evaluator artifact, or the artifact
+            lacks a name or a version.
+    """
+    artifacts: Any = getattr(job, "artifacts", None)
+    evaluator: Any = None if artifacts is None else getattr(artifacts, "evaluator", None)
+    if evaluator is None:
+        raise RuntimeError("Rubric generation job completed without an evaluator artifact.")
+
+    ev_name = getattr(evaluator, "name", None)
+    ev_version = getattr(evaluator, "version", None)
+    if not ev_name:
+        raise RuntimeError("Generated evaluator artifact is missing a name.")
+    if ev_version is None:
+        raise RuntimeError("Generated evaluator artifact is missing a version.")
+
+    definition: Any = getattr(evaluator, "definition", None)
+    raw_dimensions: Any = None if definition is None else getattr(definition, "dimensions", None)
+    # Malformed dimension entries are logged and skipped; with none left, ``dimensions`` stays ``None``.
+    collected: list[RubricDimension] = []
+    for item in raw_dimensions or ():
+        try:
+            collected.append(
+                RubricDimension(
+                    id=str(getattr(item, "id", "") or ""),
+                    description=str(getattr(item, "description", "") or ""),
+                    weight=int(getattr(item, "weight", 0) or 0),
+                    always_applicable=bool(getattr(item, "always_applicable", False)),
+                )
+            )
+        except (TypeError, ValueError):
+            logger.debug("Skipping malformed dimension on generated evaluator", exc_info=True)
+
+    # Only a numeric ``pass_threshold`` is kept; any other value is reported as ``None``.
+    raw_threshold: Any = None if definition is None else getattr(definition, "pass_threshold", None)
+    return GeneratedEvaluatorRef(
+        name=str(ev_name),
+        version=str(ev_version),
+        category=category,
+        display_name=getattr(evaluator, "display_name", None),
+        description=getattr(evaluator, "description", None),
+        dimensions=tuple(collected) if collected else None,
+        pass_threshold=float(raw_threshold) if isinstance(raw_threshold, (int, float)) else None,
+    )
+
 
 # ---------------------------------------------------------------------------
 # Foundry-specific functions (not part of the Evaluator protocol)
diff --git a/python/packages/foundry/tests/test_evals_config.py b/python/packages/foundry/tests/test_evals_config.py
new file mode 100644
index 0000000000..a1c86187d4
--- /dev/null
+++ b/python/packages/foundry/tests/test_evals_config.py
@@ -0,0 +1,273 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Tests for the YAML-driven evaluator configuration loader."""
+
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+from agent_framework_foundry._evals_config import (
+    RubricGenerationSpec,
+    RubricSourceSpec,
+    build_sources,
+    load_evals_config,
+    parse_evals_config,
+)
+from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+
+def _make_agent(name: str = "agent-a", instructions: str = "Be brief.") -> Any:
+    from agent_framework._evaluation import _render_agent_dossier
+
+    agent = MagicMock()  # stand-in agent; only the attributes assigned below hold real values
+    agent.name = name
+    agent.description = f"{name} description"
+    agent.default_options = {"instructions": instructions, "tools": []}
+    agent.context_providers = []
+    agent.mcp_tools = []
+    agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier(  # run the real renderer
+        agent,
+        include_instructions=kw.get("include_instructions", True),  # fallback when the caller omits a flag
+        include_tools=kw.get("include_tools", True),
+        include_context_providers=kw.get("include_context_providers", False),
+        include_examples=kw.get("include_examples", False),
+        examples=kw.get("examples"),
+    )
+    return agent
+
+
+def _make_workflow() -> Any:
+    from agent_framework._evaluation import _render_workflow_dossier
+
+    workflow = MagicMock()  # stand-in workflow; only the attributes assigned below hold real values
+    workflow.name = "wf-1"
+    workflow.description = "demo"
+    workflow.to_dict.return_value = {"name": "wf-1", "id": "wf_1", "executors": {}, "edge_groups": []}
+    workflow.executors = {}
+    workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(  # run the real renderer
+        workflow,
+        include_instructions=kw.get("include_instructions", True),  # fallback when the caller omits a flag
+        include_tools=kw.get("include_tools", True),
+        include_context_providers=kw.get("include_context_providers", False),
+        include_examples=kw.get("include_examples", False),
+        examples=kw.get("examples"),
+        include_topology=kw.get("include_topology", True),
+    )
+    return workflow
+
+
+class TestParseEvalsConfig:
+    """Parsing already-loaded dicts into RubricGenerationSpec instances."""
+
+    def test_minimal_spec(self) -> None:
+        config = parse_evals_config({
+            "evaluators": {
+                "my-rubric": {
+                    "type": "foundry.generated_rubric",  # the only key supplied for this evaluator
+                }
+            }
+        })
+        assert "my-rubric" in config
+        spec = config["my-rubric"]
+        assert spec.name == "my-rubric"  # the mapping key becomes the spec name
+        assert spec.type == "foundry.generated_rubric"
+        assert spec.category == "quality"  # not present in the input above
+        assert spec.sources == ()  # no "sources" key yields an empty tuple
+
+    def test_full_spec_with_sources(self) -> None:
+        config = parse_evals_config({
+            "evaluators": {
+                "reservation-quality": {
+                    "type": "foundry.generated_rubric",
+                    "category": "quality",
+                    "model": "gpt-4o",
+                    "agent": "reservation-agent",
+                    "display_name": "Reservation Quality",
+                    "description": "Custom rubric for reservation agent.",
+                    "sources": [
+                        {
+                            "type": "agent",
+                            "include_instructions": True,
+                            "include_tools": True,
+                            "include_context_providers": True,
+                        },
+                        {
+                            "type": "dataset",
+                            "name": "reservation-business-rules",
+                            "version": 1,  # given as an int; asserted as the string "1" below
+                        },
+                    ],
+                }
+            }
+        })
+        spec = config["reservation-quality"]
+        assert spec.model == "gpt-4o"
+        assert spec.agent == "reservation-agent"
+        assert spec.display_name == "Reservation Quality"
+        assert len(spec.sources) == 2
+
+        agent_src = spec.sources[0]  # sources keep the order they had in the input list
+        assert agent_src.type == "agent"
+        assert agent_src.include_context_providers is True
+
+        dataset_src = spec.sources[1]
+        assert dataset_src.type == "dataset"
+        assert dataset_src.name == "reservation-business-rules"
+        assert dataset_src.version == "1"  # coerced to string
+
+    def test_rejects_non_mapping(self) -> None:
+        with pytest.raises(ValueError, match="must be a mapping"):
+            parse_evals_config([])  # a list at the top level instead of a mapping
+
+    def test_rejects_missing_evaluators_key(self) -> None:
+        with pytest.raises(ValueError, match="evaluators"):
+            parse_evals_config({"other": {}})  # a mapping, but without an "evaluators" key
+
+    def test_rejects_unknown_type(self) -> None:
+        with pytest.raises(ValueError, match="unsupported type"):
+            parse_evals_config({"evaluators": {"x": {"type": "foundry.other"}}})
+
+    def test_rejects_invalid_category(self) -> None:
+        with pytest.raises(ValueError, match="invalid category"):
+            parse_evals_config({"evaluators": {"x": {"type": "foundry.generated_rubric", "category": "bogus"}}})
+
+    def test_rejects_invalid_source_type(self) -> None:
+        with pytest.raises(ValueError, match="invalid type"):
+            parse_evals_config({
+                "evaluators": {
+                    "x": {
+                        "type": "foundry.generated_rubric",
+                        "sources": [{"type": "bogus"}],  # the bad value is the source's type, not the evaluator's
+                    }
+                }
+            })
+
+
+class TestLoadEvalsConfig:
+    """End-to-end YAML loading."""
+
+    def test_load_from_yaml_file(self, tmp_path: Path) -> None:
+        pytest.importorskip("yaml")  # skip this test when the yaml module cannot be imported
+        config_path = tmp_path / "evals.yaml"
+        config_path.write_text(
+            textwrap.dedent(
+                """\
+                evaluators:
+                  my-eval:
+                    type: foundry.generated_rubric
+                    category: safety
+                    model: gpt-4o-mini
+                    sources:
+                      - type: prompt
+                        prompt: "Score the response."
+                """
+            ),
+            encoding="utf-8",
+        )
+        config = load_evals_config(config_path)  # called with the Path of the file written above
+        assert "my-eval" in config
+        spec = config["my-eval"]
+        assert spec.category == "safety"
+        assert spec.model == "gpt-4o-mini"
+        assert len(spec.sources) == 1
+        assert spec.sources[0].type == "prompt"
+        assert spec.sources[0].prompt == "Score the response."
+
+
+class TestBuildSources:
+    """Translate RubricGenerationSpec sources into EvalGenerationSource instances."""
+
+    def test_no_sources_with_agent_default(self) -> None:
+        spec = RubricGenerationSpec(name="x")  # no sources declared on the spec
+        agent = _make_agent()
+        sources = build_sources(spec, agent=agent)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"  # the supplied agent is turned into a single prompt source
+        assert sources[0].prompt is not None
+        assert "Agent name: agent-a" in sources[0].prompt
+
+    def test_no_sources_with_workflow_default(self) -> None:
+        spec = RubricGenerationSpec(name="x")  # no sources declared on the spec
+        workflow = _make_workflow()
+        sources = build_sources(spec, workflow=workflow)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"  # the supplied workflow is turned into a single prompt source
+        assert sources[0].prompt is not None
+        assert "Workflow name: wf-1" in sources[0].prompt
+
+    def test_no_sources_no_agent_or_workflow_raises(self) -> None:
+        spec = RubricGenerationSpec(name="x")
+        with pytest.raises(ValueError, match="no sources"):
+            build_sources(spec)  # neither agent= nor workflow= is given
+
+    def test_agent_source_uses_supplied_agent(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="agent", include_context_providers=True),),
+        )
+        agent = _make_agent()
+        sources = build_sources(spec, agent=agent)
+        assert sources[0].type == "prompt"  # an "agent" spec plus an agent object yields a prompt source
+        assert sources[0].prompt is not None
+        assert "Agent name: agent-a" in sources[0].prompt
+
+    def test_agent_source_with_agent_name_uses_hosted_path(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="agent", agent_name="hosted-foundry-agent"),),
+        )
+        sources = build_sources(spec)  # no agent= object; agent_name on the source spec is enough
+        assert sources[0].type == "agent"
+        assert sources[0].agent_name == "hosted-foundry-agent"
+
+    def test_agent_source_without_agent_raises(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="agent"),),
+        )
+        with pytest.raises(ValueError, match="no agent="):
+            build_sources(spec)  # agent source with neither agent_name nor an agent= object
+
+    def test_workflow_source_uses_supplied_workflow(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="workflow", include_topology=False),),
+        )
+        workflow = _make_workflow()
+        sources = build_sources(spec, workflow=workflow)
+        assert sources[0].type == "prompt"
+        assert sources[0].prompt is not None
+        assert "Workflow name: wf-1" in sources[0].prompt
+        assert "Topology (JSON):" not in sources[0].prompt  # include_topology=False was set on the source spec
+
+    def test_prompt_source_translates_directly(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="prompt", prompt="Score it."),),
+        )
+        sources = build_sources(spec)  # prompt sources need neither agent= nor workflow=
+        assert sources[0] == EvalGenerationSource(type="prompt", prompt="Score it.")
+
+    def test_dataset_source_translates(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="dataset", name="ds", version="2"),),
+        )
+        sources = build_sources(spec)
+        assert sources[0].type == "dataset"
+        assert sources[0].dataset_name == "ds"  # the spec's "name" surfaces as dataset_name
+        assert sources[0].dataset_version == "2"  # and its "version" as dataset_version
+
+    def test_traces_source_passes_metadata(self) -> None:
+        spec = RubricGenerationSpec(
+            name="x",
+            sources=(RubricSourceSpec(type="traces", metadata={"environment": "prod"}),),
+        )
+        sources = build_sources(spec)
+        assert sources[0].type == "traces"
+        assert sources[0].metadata == {"environment": "prod"}  # equal to the dict given on the source spec
diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py
index a5d9f2e864..7244347e05 100644
--- a/python/packages/foundry/tests/test_foundry_evals.py
+++ b/python/packages/foundry/tests/test_foundry_evals.py
@@ -6,7 +6,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -64,6 +64,32 @@ def _make_tool(name: str) -> MagicMock:
     return t
 
 
+def _make_stub_agent(
+    *,
+    name: str = "alpha",
+    description: str = "An agent.",
+    instructions: str = "Be brief.",
+) -> MagicMock:
+    """Mock agent whose as_eval_source returns a real dossier string."""
+    from agent_framework._evaluation import _render_agent_dossier
+
+    agent = MagicMock()  # attributes not assigned below remain auto-created MagicMock objects
+    agent.name = name
+    agent.description = description
+    agent.default_options = {"instructions": instructions, "tools": []}
+    agent.context_providers = []
+    agent.mcp_tools = []
+    agent.as_eval_source.side_effect = lambda **kw: _render_agent_dossier(  # run the real renderer
+        agent,
+        include_instructions=kw.get("include_instructions", True),  # fallback when the caller omits a flag
+        include_tools=kw.get("include_tools", True),
+        include_context_providers=kw.get("include_context_providers", False),
+        include_examples=kw.get("include_examples", False),
+        examples=kw.get("examples"),
+    )
+    return agent
+
+
 @dataclass
 class _MockResultCounts:
     """Mock matching the OpenAI SDK ResultCounts Pydantic model shape."""
@@ -806,6 +832,73 @@ def test_all_tool_evaluators_include_tool_definitions(self) -> None:
         for c in criteria:
             assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions"
 
+    def test_generated_evaluator_ref_pinned_version(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")  # explicit version, no display_name
+        criteria = _build_testing_criteria([ref], "gpt-4o", include_data_mapping=True)
+
+        assert len(criteria) == 1
+        c = criteria[0]
+        assert c["type"] == "azure_ai_evaluator"
+        assert c["evaluator_name"] == "my-rubric"
+        assert c["evaluator_version"] == "1"
+        assert c["name"] == "my-rubric"  # same as the ref name here, where no display_name was given
+        assert c["initialization_parameters"] == {"deployment_name": "gpt-4o"}  # the "gpt-4o" passed above
+        assert c["data_mapping"] == {
+            "query": "{{item.query_messages}}",
+            "response": "{{item.response_messages}}",
+        }
+
+    def test_generated_evaluator_ref_display_name_used_as_short(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="2", display_name="My Rubric")
+        criteria = _build_testing_criteria([ref], "gpt-4o")
+
+        assert criteria[0]["name"] == "My Rubric"  # display_name becomes the criterion name
+        assert criteria[0]["evaluator_name"] == "my-rubric"  # while evaluator_name stays the ref name
+
+    def test_generated_evaluator_ref_tool_definitions_added(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria(
+            [ref],
+            "gpt-4o",
+            include_data_mapping=True,
+            include_tool_definitions=True,  # the flag whose effect is asserted below
+        )
+
+        assert criteria[0]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}"
+
+    def test_generated_evaluator_ref_unpinned_warns(self, caplog: pytest.LogCaptureFixture) -> None:
+        import logging
+
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef.latest("my-rubric")  # built via .latest(), without a version argument
+        with caplog.at_level(logging.WARNING, logger="agent_framework_foundry._foundry_evals"):
+            criteria = _build_testing_criteria([ref], "gpt-4o")
+
+        assert "evaluator_version" not in criteria[0]  # the key is absent altogether, not present with None
+        assert any("no pinned version" in r.message for r in caplog.records)  # at least one matching record
+
+    def test_generated_evaluator_ref_mixed_with_builtins(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="my-rubric", version="1")
+        criteria = _build_testing_criteria(
+            ["relevance", ref, "task_adherence"],  # plain string names and a ref in the same list
+            "gpt-4o",
+            include_data_mapping=True,
+        )
+
+        assert [c["name"] for c in criteria] == ["relevance", "my-rubric", "task_adherence"]
+        assert criteria[0]["evaluator_name"] == "builtin.relevance"  # string names gain a "builtin." prefix
+        assert criteria[1]["evaluator_name"] == "my-rubric"  # the ref's name is used without a prefix
+        assert criteria[2]["evaluator_name"] == "builtin.task_adherence"
+
 
 # ---------------------------------------------------------------------------
 # _build_item_schema
@@ -1263,6 +1356,31 @@ def test_raises_when_all_filtered(self) -> None:
                 items,
             )
 
+    def test_preserves_generated_ref_when_no_tools(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="rubric", version="1")
+        items = [
+            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),  # text-only turns
+        ]
+        result = _filter_tool_evaluators(
+            ["relevance", ref, "tool_call_accuracy"],
+            items,
+        )
+        assert "relevance" in result
+        assert ref in result  # the ref object itself is kept
+        assert "tool_call_accuracy" not in result  # dropped for these items
+
+    def test_generated_ref_alone_does_not_raise(self) -> None:
+        from agent_framework_foundry import GeneratedEvaluatorRef
+
+        ref = GeneratedEvaluatorRef(name="rubric", version="1")
+        items = [
+            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),  # text-only turns
+        ]
+        result = _filter_tool_evaluators([ref], items)  # the ref is the only evaluator requested
+        assert result == [ref]
+
 
 # ---------------------------------------------------------------------------
 # EvalResults
@@ -2369,6 +2487,124 @@ async def test_handles_api_failure_gracefully(self) -> None:
         items = await _fetch_output_items(mock_client, "eval_1", "run_1")
         assert items == []
 
+    async def test_extracts_rubric_scores_from_dict_sample(self) -> None:
+        from agent_framework_foundry._foundry_evals import _fetch_output_items
+
+        mock_result = MagicMock()
+        mock_result.name = "my-rubric"
+        mock_result.score = 0.85
+        mock_result.passed = True
+        mock_result.sample = {  # a plain dict, with the scores nested under "properties"
+            "properties": {
+                "rubric_scores": [
+                    {"id": "policy", "score": 4, "applicable": True, "weight": 1, "reason": "ok"},
+                    {"id": "safety", "score": None, "applicable": False, "weight": 1, "reason": "n/a"},
+                ]
+            }
+        }
+
+        mock_oi = MagicMock()
+        mock_oi.id = "oi_1"
+        mock_oi.status = "pass"
+        mock_oi.results = [mock_result]
+        mock_oi.sample = None  # the rubric scores sit on the result's sample, not the output item's
+        mock_oi.datasource_item = {}
+
+        mock_client = MagicMock()
+        mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi]))
+
+        items = await _fetch_output_items(mock_client, "eval_1", "run_1")
+
+        assert len(items) == 1
+        scores = items[0].scores
+        assert len(scores) == 1  # one score for the single entry in mock_oi.results
+        assert scores[0].dimensions is not None
+        assert len(scores[0].dimensions) == 2  # both rubric_scores entries were parsed
+        policy = next(d for d in scores[0].dimensions if d.id == "policy")
+        assert policy.score == 4
+        assert policy.applicable is True
+        assert policy.weight == 1
+        assert policy.reason == "ok"
+        safety = next(d for d in scores[0].dimensions if d.id == "safety")
+        assert safety.score is None  # a None score is kept as None
+        assert safety.applicable is False
+
+    async def test_no_rubric_scores_when_absent(self) -> None:
+        from agent_framework_foundry._foundry_evals import _fetch_output_items
+
+        mock_result = MagicMock()
+        mock_result.name = "relevance"
+        mock_result.score = 0.85
+        mock_result.passed = True
+        mock_result.sample = None  # nothing to read rubric scores from
+
+        mock_oi = MagicMock()
+        mock_oi.id = "oi_2"
+        mock_oi.status = "pass"
+        mock_oi.results = [mock_result]
+        mock_oi.sample = None
+        mock_oi.datasource_item = {}
+
+        mock_client = MagicMock()
+        mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi]))
+
+        items = await _fetch_output_items(mock_client, "eval_1", "run_1")
+
+        assert items[0].scores[0].dimensions is None  # None, as opposed to an empty sequence
+
+
+class TestExtractRubricScores:
+    def test_handles_attribute_style_properties(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+
+        rs = MagicMock()  # score entry exposing its fields as attributes rather than dict keys
+        rs.id = "policy"
+        rs.score = 5
+        rs.applicable = True
+        rs.weight = 2
+        rs.reason = "ok"
+
+        sample = MagicMock()  # the sample and its properties are objects too, not dicts
+        sample.properties = MagicMock()
+        sample.properties.rubric_scores = [rs]
+
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert result[0].id == "policy"
+        assert result[0].score == 5
+        assert result[0].weight == 2
+
+    def test_top_level_rubric_scores_in_dict(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+
+        sample = {"rubric_scores": [{"id": "a", "score": 3, "applicable": True, "weight": 1, "reason": "r"}]}
+        result = _extract_rubric_scores(sample)  # rubric_scores at the top level, with no "properties" wrapper
+        assert result is not None
+        assert result[0].id == "a"
+
+    def test_returns_none_when_missing(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+
+        assert _extract_rubric_scores(None) is None
+        assert _extract_rubric_scores({}) is None
+        assert _extract_rubric_scores({"properties": {}}) is None  # "properties" present, rubric_scores absent
+
+    def test_skips_malformed_entries(self) -> None:
+        from agent_framework_foundry._foundry_evals import _extract_rubric_scores
+
+        sample = {
+            "properties": {
+                "rubric_scores": [
+                    {"id": "good", "score": 3, "applicable": True, "weight": 1, "reason": "ok"},
+                    {"id": "bad-no-weight", "score": 2, "applicable": True, "reason": "x"},  # no "weight" key
+                ]
+            }
+        }
+        result = _extract_rubric_scores(sample)
+        assert result is not None
+        assert len(result) == 1  # the entry without a weight is dropped, the complete one is kept
+        assert result[0].id == "good"
+
 
 # ---------------------------------------------------------------------------
 # _poll_eval_run — timeout / failed / canceled paths
@@ -2758,3 +2994,489 @@ async def test_target_without_type_raises(self) -> None:
                 client=mock_client,
                 model="gpt-4o",
             )
+
+
+class TestFoundryAgentAsEvalSource:
+    """Tests for foundry's agent_as_eval_source helper (wraps BaseAgent.as_eval_source)."""
+
+    def test_returns_prompt_source_with_dossier(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        source = agent_as_eval_source(agent)
+        assert source.type == "prompt"
+        assert source.description == "Looks up the weather."  # same text as the agent's description
+        assert source.prompt is not None
+        assert "Agent name: weather-bot" in source.prompt
+        assert "Be brief." in source.prompt  # the stub's default instructions text
+
+    def test_hosted_agent_name_emits_agent_source(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        source = agent_as_eval_source(agent, hosted_agent_name="weather-bot-hosted-id")
+        assert source.type == "agent"
+        assert source.agent_name == "weather-bot-hosted-id"  # the hosted name, not agent.name
+        assert source.prompt is None  # no dossier text on an agent-type source
+        assert source.description == "Looks up the weather."
+
+    def test_explicit_hosted_agent_version_forwarded(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot")
+        source = agent_as_eval_source(
+            agent,
+            hosted_agent_name="weather-bot-hosted-id",
+            hosted_agent_version="3",
+        )
+        assert source.type == "agent"
+        assert source.agent_name == "weather-bot-hosted-id"
+        assert source.agent_version == "3"
+
+    def test_auto_detects_hosted_foundry_agent(self) -> None:
+        """A chat_client carrying agent_name/agent_version is treated as a hosted agent."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-prompt-agent"  # real str values, set explicitly
+        agent.chat_client.agent_version = "2"
+
+        source = agent_as_eval_source(agent)  # no hosted_agent_name argument is passed
+        assert source.type == "agent"
+        assert source.agent_name == "weather-prompt-agent"
+        assert source.agent_version == "2"
+        assert source.prompt is None
+        assert source.description == "Looks up the weather."
+
+    def test_auto_detection_handles_versionless_hosted_agent(self) -> None:
+        """HostedAgents typically omit agent_version (no None forwarded)."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-hosted-agent"
+        agent.chat_client.agent_version = None
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "agent"  # a string agent_name alone still yields an agent source
+        assert source.agent_name == "weather-hosted-agent"
+        assert source.agent_version is None  # the source's version is None when the client's is None
+
+    def test_force_prompt_source_overrides_auto_detection(self) -> None:
+        """force_prompt_source=True falls back to dossier even for hosted agents."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="weather-bot", description="Looks up the weather.")
+        agent.chat_client = MagicMock()
+        agent.chat_client.agent_name = "weather-prompt-agent"
+        agent.chat_client.agent_version = "2"
+
+        source = agent_as_eval_source(agent, force_prompt_source=True)
+        assert source.type == "prompt"  # despite the hosted-style chat_client configured above
+        assert source.prompt is not None
+        assert "Agent name: weather-bot" in source.prompt
+
+    def test_auto_detection_ignores_non_string_chat_client_fields(self) -> None:
+        """Bare MagicMock chat_client (untyped attrs) must not trigger detection."""
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent(name="local-agent")
+        agent.chat_client = MagicMock()  # agent_name attr resolves to a MagicMock, not a str
+
+        source = agent_as_eval_source(agent)
+        assert source.type == "prompt"
+        assert source.prompt is not None
+        assert "Agent name: local-agent" in source.prompt
+
+    def test_forwards_keyword_options_to_agent(self) -> None:
+        from agent_framework_foundry._foundry_evals import agent_as_eval_source
+
+        agent = _make_stub_agent()
+        source = agent_as_eval_source(agent, include_instructions=False)
+        assert source.prompt is not None
+        assert "Instructions:" not in source.prompt  # include_instructions=False took effect in the dossier
+
+
+class TestFoundryWorkflowAsEvalSource:
+    """Tests for foundry's workflow_as_eval_source helper (wraps Workflow.as_eval_source)."""
+
+    def _make_workflow(self) -> MagicMock:
+        from agent_framework._evaluation import _render_workflow_dossier
+
+        workflow = MagicMock()  # stand-in workflow; only the attributes assigned below hold real values
+        workflow.name = "demo-workflow"
+        workflow.description = "Routes user questions."
+        workflow.to_dict.return_value = {  # canned to_dict() payload with no executors or edge groups
+            "name": "demo-workflow",
+            "id": "wf_1",
+            "executors": {},
+            "edge_groups": [],
+        }
+        workflow.executors = {}
+        workflow.as_eval_source.side_effect = lambda **kw: _render_workflow_dossier(  # run the real renderer
+            workflow,
+            include_instructions=kw.get("include_instructions", True),
+            include_tools=kw.get("include_tools", True),
+            include_context_providers=kw.get("include_context_providers", False),
+            include_examples=kw.get("include_examples", False),
+            examples=kw.get("examples"),
+            include_topology=kw.get("include_topology", True),
+        )
+        return workflow
+
+    def test_returns_prompt_source_with_topology(self) -> None:
+        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
+
+        workflow = self._make_workflow()
+        source = workflow_as_eval_source(workflow)  # no include_topology argument is passed
+        assert source.type == "prompt"
+        assert source.description == "Routes user questions."  # same text as workflow.description
+        assert source.prompt is not None
+        assert "Workflow name: demo-workflow" in source.prompt
+        assert "Topology (JSON):" in source.prompt
+
+    def test_topology_can_be_disabled(self) -> None:
+        from agent_framework_foundry._foundry_evals import workflow_as_eval_source
+
+        workflow = self._make_workflow()
+        source = workflow_as_eval_source(workflow, include_topology=False)
+        assert source.prompt is not None
+        assert "Topology (JSON):" not in source.prompt  # the section heading is gone from the dossier
+
+
+class TestCoalesceGenerationSources:
+    """Validation for the source-resolution helper used by FoundryEvals.generate_rubric."""
+
+    def test_requires_exactly_one_source(self) -> None:
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        with pytest.raises(ValueError, match="Provide one of"):
+            _coalesce_generation_sources(agent=None, workflow=None, sources=None)  # nothing supplied at all
+
+    def test_rejects_multiple_sources(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _coalesce_generation_sources
+
+        agent = MagicMock()  # built by hand; as_eval_source is left as an unconfigured mock
+        agent.name = "a"
+        agent.description = None
+        agent.default_options = {"instructions": "x", "tools": []}
+        agent.context_providers = []
+        agent.mcp_tools = []
+        with pytest.raises(ValueError, match="only one of"):
+            _coalesce_generation_sources(
+                agent=agent,
+                workflow=None,
+                sources=[EvalGenerationSource(type="prompt", prompt="hi")],  # passed together with agent=
+            )
+
+    def test_uses_agent_helper_when_only_agent_supplied(self) -> None:
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        agent = _make_stub_agent(name="alpha", description="An agent.")
+
+        sources = _coalesce_generation_sources(agent=agent, workflow=None, sources=None)
+        assert len(sources) == 1
+        assert sources[0].type == "prompt"  # the agent alone resolves to a single prompt source
+        assert sources[0].prompt is not None
+        assert "Agent name: alpha" in sources[0].prompt
+
+    def test_rejects_empty_sources_list(self) -> None:
+        from agent_framework_foundry._foundry_evals import _coalesce_generation_sources
+
+        with pytest.raises(ValueError, match="at least one"):
+            _coalesce_generation_sources(agent=None, workflow=None, sources=[])  # empty list, as opposed to None
+
+
+class TestToSdkSource:
+    """Translation between EvalGenerationSource and SDK *JobSource types."""
+
+    def _make_sdk_types(self, *, with_agent: bool = True, with_dataset: bool = True, with_traces: bool = True) -> Any:
+        from agent_framework_foundry._foundry_evals import _GenerationSdkTypes
+
+        return _GenerationSdkTypes(
+            EvaluatorGenerationInputs=MagicMock(),
+            EvaluatorGenerationJob=MagicMock(),
+            PromptSource=MagicMock(name="PromptSource"),  # always a mock; the three below may be None
+            AgentSource=MagicMock(name="AgentSource") if with_agent else None,
+            DatasetSource=MagicMock(name="DatasetSource") if with_dataset else None,
+            TracesSource=MagicMock(name="TracesSource") if with_traces else None,
+        )
+
+    def test_prompt_source_is_translated(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.PromptSource.return_value = "prompt-sdk-instance"  # sentinel returned by the mocked constructor
+        out = _to_sdk_source(
+            EvalGenerationSource(type="prompt", prompt="hello", description="d"),
+            sdk,
+        )
+        assert out == "prompt-sdk-instance"  # the constructor's result is what comes back
+        sdk.PromptSource.assert_called_once_with(prompt="hello", description="d")
+
+    def test_prompt_without_text_raises(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        with pytest.raises(ValueError, match="non-empty prompt"):
+            _to_sdk_source(EvalGenerationSource(type="prompt"), sdk)  # prompt field left unset
+
+    def test_agent_source_is_translated(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.AgentSource.return_value = "agent-sdk-instance"  # sentinel returned by the mocked constructor
+        out = _to_sdk_source(
+            EvalGenerationSource(type="agent", agent_name="my-hosted-agent"),
+            sdk,
+        )
+        assert out == "agent-sdk-instance"
+        sdk.AgentSource.assert_called_once_with(agent_name="my-hosted-agent")  # exact match: no other kwargs
+
+    def test_agent_source_requires_name(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        with pytest.raises(ValueError, match="agent_name"):
+            _to_sdk_source(EvalGenerationSource(type="agent"), sdk)
+
+    def test_agent_source_raises_when_sdk_missing(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types(with_agent=False)
+        with pytest.raises(NotImplementedError, match="AgentEvaluatorGenerationJobSource"):
+            _to_sdk_source(
+                EvalGenerationSource(type="agent", agent_name="x"),
+                sdk,
+            )
+
+    def test_dataset_source_is_translated(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.DatasetSource.return_value = "dataset-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="dataset", dataset_name="ds", dataset_version="1"),
+            sdk,
+        )
+        assert out == "dataset-sdk-instance"
+        sdk.DatasetSource.assert_called_once_with(name="ds", version="1")
+
+    def test_agent_source_forwards_agent_version(self) -> None:
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource, _to_sdk_source
+
+        sdk = self._make_sdk_types()
+        sdk.AgentSource.return_value = "agent-sdk-instance"
+        out = _to_sdk_source(
+            EvalGenerationSource(type="agent", agent_name="prompt-agent", agent_version="2"),
+            sdk,
+        )
+        assert out == "agent-sdk-instance"
+        sdk.AgentSource.assert_called_once_with(agent_name="prompt-agent", agent_version="2")
+
+
+class TestPollGenerationJob:
+    """Behavior of the rubric-generation polling loop.
+
+    The operations client is a ``MagicMock`` whose ``get_generation_job`` is an ``AsyncMock``, so each test
+    scripts the statuses the poller sees; poll intervals and timeouts are fractions of a second.
+    """
+
+    async def test_returns_immediately_on_succeeded(self) -> None:
+        """A job that is already ``succeeded`` is returned as-is without fetching its status."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        ops = MagicMock(get_generation_job=AsyncMock())
+        finished_job = MagicMock(id="job_1", status="succeeded")
+        result = await _poll_generation_job(ops, finished_job, poll_interval=0.01, timeout=1.0)
+        ops.get_generation_job.assert_not_called()
+        assert result is finished_job
+
+    async def test_polls_until_terminal(self) -> None:
+        """Polling repeats while the job is ``running`` and returns the succeeded job from the second fetch."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        first_fetch = MagicMock(id="job_1", status="running")
+        second_fetch = MagicMock(id="job_1", status="succeeded")
+        ops = MagicMock(get_generation_job=AsyncMock(side_effect=[first_fetch, second_fetch]))
+        submitted = MagicMock(id="job_1", status="running")
+
+        result = await _poll_generation_job(ops, submitted, poll_interval=0.001, timeout=1.0)
+        assert result is second_fetch
+        assert ops.get_generation_job.await_count == 2
+
+    async def test_failed_status_raises(self) -> None:
+        """A fetched job in ``failed`` status raises ``RuntimeError`` carrying the job's error message."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        error_detail = MagicMock(message="boom")
+        failed_job = MagicMock(id="job_1", status="failed", error=error_detail)
+        ops = MagicMock(get_generation_job=AsyncMock(return_value=failed_job))
+        submitted = MagicMock(id="job_1", status="running")
+
+        with pytest.raises(RuntimeError, match="boom"):
+            await _poll_generation_job(ops, submitted, poll_interval=0.001, timeout=1.0)
+
+    async def test_timeout_raises(self) -> None:
+        """A job that never leaves ``running`` raises ``TimeoutError`` once the timeout elapses."""
+        from agent_framework_foundry._foundry_evals import _poll_generation_job
+
+        stuck_job = MagicMock(id="job_1", status="running")
+        ops = MagicMock(get_generation_job=AsyncMock(return_value=stuck_job))
+
+        with pytest.raises(TimeoutError):
+            await _poll_generation_job(ops, stuck_job, poll_interval=0.001, timeout=0.005)
+
+
+class TestGenerationJobToRef:
+    """Translation of a completed generation job to a GeneratedEvaluatorRef."""
+
+    def test_builds_pinned_ref_with_dimensions(self) -> None:
+        """Evaluator metadata, pass threshold, and rubric dimensions are copied; the version becomes a string."""
+        from agent_framework_foundry._foundry_evals import RubricDimension, _generation_job_to_ref
+
+        dim = MagicMock(id="d1", description="dim", weight=2, always_applicable=True)
+        definition = MagicMock(dimensions=[dim], pass_threshold=0.75)
+        evaluator = MagicMock(
+            version=3,
+            display_name="My Eval",
+            description="A custom rubric.",
+            definition=definition,
+        )
+        evaluator.name = "my-eval"  # Mock(name=...) only labels the mock, so the attribute must be assigned
+        job = MagicMock(artifacts=MagicMock(evaluator=evaluator))
+
+        ref = _generation_job_to_ref(job, category="quality")
+        assert ref.name == "my-eval"
+        assert ref.version == "3"
+        assert ref.display_name == "My Eval"
+        assert ref.description == "A custom rubric."
+        assert ref.category == "quality"
+        assert ref.pass_threshold == 0.75
+        assert ref.dimensions is not None
+        assert ref.dimensions[0] == RubricDimension(id="d1", description="dim", weight=2, always_applicable=True)
+
+    def test_missing_artifacts_raises(self) -> None:
+        """A job whose ``artifacts`` is ``None`` raises ``RuntimeError`` naming the evaluator artifact."""
+        from agent_framework_foundry._foundry_evals import _generation_job_to_ref
+
+        with pytest.raises(RuntimeError, match="evaluator artifact"):
+            _generation_job_to_ref(MagicMock(artifacts=None), category="quality")
+
+
+class TestGenerateRubricSdkMissing:
+    """Early failures of generate_rubric: SDK without the rubric APIs, or an invalid category."""
+
+    async def test_raises_when_sdk_types_unavailable(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """A failing SDK type import surfaces from ``generate_rubric`` as ``NotImplementedError``."""
+        from agent_framework_foundry import _foundry_evals as fm
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+        def _sdk_unavailable() -> Any:
+            raise fm._RubricSdkUnavailableError(fm._RUBRIC_SDK_MISSING_MSG)
+
+        monkeypatch.setattr(fm, "_import_generation_sdk_types", _sdk_unavailable)
+        prompt_sources = [EvalGenerationSource(type="prompt", prompt="hi")]
+
+        with pytest.raises(NotImplementedError, match="rubric"):
+            await FoundryEvals.generate_rubric(
+                project_client=MagicMock(),
+                name="my-eval",
+                sources=prompt_sources,
+            )
+
+    async def test_raises_value_error_on_invalid_category(self) -> None:
+        """A ``category`` outside {quality, safety} is rejected with ``ValueError``."""
+        from agent_framework_foundry._foundry_evals import EvalGenerationSource
+
+        prompt_sources = [EvalGenerationSource(type="prompt", prompt="hi")]
+
+        with pytest.raises(ValueError, match="category"):
+            await FoundryEvals.generate_rubric(
+                project_client=MagicMock(),
+                name="my-eval",
+                sources=prompt_sources,
+                category=cast("Any", "invalid"),
+            )
+
+
+class TestGenerateRubricE2E:
+    """End-to-end happy path for generate_rubric with mocked SDK."""
+
+    async def test_generate_rubric_from_agent(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """A local agent is rendered into one prompt source and submitted as a generation job."""
+        from agent_framework_foundry import _foundry_evals as fm
+
+        # Stub SDK type handles; each stub returns a sentinel string so the assembly can be asserted below
+        prompt_cls = MagicMock(name="PromptSource")
+        prompt_cls.return_value = "sdk-prompt"
+        inputs_cls = MagicMock(name="EvaluatorGenerationInputs")
+        inputs_cls.return_value = "sdk-inputs"
+        job_cls = MagicMock(name="EvaluatorGenerationJob")
+        job_cls.return_value = "sdk-job"
+
+        sdk_types = fm._GenerationSdkTypes(
+            EvaluatorGenerationInputs=inputs_cls,
+            EvaluatorGenerationJob=job_cls,
+            PromptSource=prompt_cls,
+            AgentSource=None,
+            DatasetSource=None,
+            TracesSource=None,
+        )
+        monkeypatch.setattr(fm, "_import_generation_sdk_types", lambda: sdk_types)
+
+        # Mock the SDK operations and completed job
+        completed_evaluator = MagicMock(version="7", display_name=None, description=None)
+        completed_evaluator.name = "agent-rubric"  # set after construction: Mock(name=...) only labels the mock
+        completed_evaluator.definition = MagicMock(dimensions=[], pass_threshold=None)
+        completed = MagicMock(
+            id="job_42",
+            status="succeeded",
+            artifacts=MagicMock(evaluator=completed_evaluator),
+        )
+        evaluators_ops = MagicMock()
+        evaluators_ops.create_generation_job = AsyncMock(return_value=completed)  # already "succeeded"
+        evaluators_ops.get_generation_job = AsyncMock(return_value=completed)
+        project_client = MagicMock()
+        project_client.beta = MagicMock(evaluators=evaluators_ops)
+
+        # Build a stub agent
+        agent = _make_stub_agent(
+            name="weather-bot",
+            description="Looks up weather.",
+            instructions="Be brief.",
+        )
+
+        ref = await FoundryEvals.generate_rubric(
+            project_client=project_client,
+            name="agent-rubric",
+            agent=agent,
+            category="quality",
+            model="gpt-4o",
+            display_name="Display",
+            description="Desc",
+            operation_id="op-123",
+        )
+
+        assert ref.name == "agent-rubric"
+        assert ref.version == "7"
+        assert ref.category == "quality"
+
+        # Verify inputs/job/source assembly
+        prompt_cls.assert_called_once()
+        prompt_kwargs = prompt_cls.call_args.kwargs
+        assert "Agent name: weather-bot" in prompt_kwargs["prompt"]
+        assert prompt_kwargs["description"] == "Looks up weather."
+
+        inputs_cls.assert_called_once()
+        inputs_kwargs = inputs_cls.call_args.kwargs
+        assert inputs_kwargs["name"] == "agent-rubric"
+        assert inputs_kwargs["category"] == "quality"
+        assert inputs_kwargs["model"] == "gpt-4o"
+        assert inputs_kwargs["display_name"] == "Display"
+        assert inputs_kwargs["description"] == "Desc"
+        assert inputs_kwargs["sources"] == ["sdk-prompt"]
+
+        job_cls.assert_called_once_with(inputs="sdk-inputs")
+        evaluators_ops.create_generation_job.assert_awaited_once_with(job="sdk-job", operation_id="op-123")
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
new file mode 100644
index 0000000000..9c19ff552b
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_with_generated_rubric_sample.py
@@ -0,0 +1,151 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Generate a Foundry rubric evaluator from an agent and use it in CI.
+
+This sample demonstrates the end-to-end adaptive-evals flow:
+
+1. Build an agent.
+2. Generate a rubric evaluator from the agent using
+   ``FoundryEvals.generate_rubric()`` — produces a pinned
+   ``GeneratedEvaluatorRef`` you can store in source control.
+3. Use the pinned reference in ``evaluators=[...]`` for a regression
+   run alongside built-in evaluators.
+4. Assert quality gates with ``assert_score_at_least`` /
+   ``assert_dimension_score_at_least`` / ``assert_no_failed_items``.
+
+A companion ``evaluators.yaml`` shows the source-controlled config
+pattern for CI.  Load it with :func:`load_evals_config` and pass the
+resulting spec through :func:`build_sources` to keep generation
+parameters out of code.
+
+Prerequisites:
+- An Azure AI Foundry project with a deployed model.
+- ``azure-ai-projects`` build that includes the rubric-generation APIs.
+- Set ``FOUNDRY_PROJECT_ENDPOINT`` and ``FOUNDRY_MODEL`` in ``.env``.
+
+Run with:
+
+.. code-block:: bash
+
+    az login
+    python evaluate_with_generated_rubric_sample.py
+"""
+
+import asyncio
+import os
+import textwrap
+from pathlib import Path
+
+from agent_framework import evaluate_agent
+from agent_framework.foundry import (
+    FoundryChatClient,
+    FoundryEvals,
+    build_sources,
+    load_evals_config,
+)
+from azure.ai.projects.aio import AIProjectClient
+from azure.identity.aio import AzureCliCredential
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def get_weather(location: str) -> str:
+    """Get the current weather for a location."""
+    city = location.lower()  # match case-insensitively; the fallback text keeps the caller's spelling
+    if city == "seattle":
+        return "62F, cloudy with a chance of rain"
+    if city == "london":
+        return "55F, overcast"
+    return "68F, partly sunny" if city == "paris" else f"Weather data not available for {location}"
+
+
+SAMPLE_YAML = textwrap.dedent(  # written to evaluators.yaml by main() when that file does not exist yet
+    """\
+    evaluators:
+      travel-quality:
+        type: foundry.generated_rubric
+        category: quality
+        model: gpt-4o
+        display_name: Travel Quality Rubric
+        description: Custom rubric tailored to the travel-assistant agent.
+        sources:
+          - type: agent
+            include_instructions: true
+            include_tools: true
+    """
+)
+
+
+async def main() -> None:
+    """Generate a rubric evaluator for a sample agent, run an evaluation with it, and assert quality gates."""
+    project_endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
+    model_name = os.environ.get("FOUNDRY_MODEL", "gpt-4o")
+
+    credential = AzureCliCredential()
+    chat_client = FoundryChatClient(project_endpoint=project_endpoint, model=model_name, credential=credential)
+    project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)
+
+    # Close the async clients even if generation, evaluation, or a quality-gate assertion raises.
+    try:
+        agent = chat_client.as_agent(
+            name="travel-assistant",
+            instructions=(
+                "You are a helpful travel assistant.  Always ground recommendations in tool output, "
+                "cite each tool result, and refuse questions outside travel planning."
+            ),
+            tools=[get_weather],
+        )
+
+        # 1. Load the source-controlled evaluator config.
+        config_path = Path(__file__).with_name("evaluators.yaml")
+        if not config_path.exists():
+            config_path.write_text(SAMPLE_YAML, encoding="utf-8")
+            print(f"Wrote sample config to {config_path}")
+        config = load_evals_config(config_path)
+        spec = config["travel-quality"]
+
+        # 2. Generate (or refresh) the rubric evaluator.  In CI you typically run
+        # this once and commit the returned name/version pair.
+        print("Generating rubric evaluator from agent + spec...")
+        sources = build_sources(spec, agent=agent)
+        rubric_ref = await FoundryEvals.generate_rubric(
+            project_client=project_client,
+            name=spec.name,
+            sources=sources,
+            category=spec.category,
+            model=spec.model,
+            display_name=spec.display_name,
+            description=spec.description,
+        )
+        dimension_count = len(rubric_ref.dimensions or ())
+        print(f"Generated rubric {rubric_ref.name}@{rubric_ref.version} with {dimension_count} dimensions")
+
+        # 3. Run an evaluation that combines built-ins with the new rubric.
+        evals = FoundryEvals(
+            client=chat_client,
+            evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY, rubric_ref],
+        )
+        results = await evaluate_agent(
+            agent=agent,
+            queries=[
+                "What's the weather in Seattle?",
+                "Should I pack an umbrella for London?",
+            ],
+            evaluators=evals,
+        )
+
+        # 4. Quality gates — wire these into your CI job's exit status.
+        for r in results:
+            print(f"\nRun {r.run_id}: {r.passed}/{r.total} passed; portal: {r.report_url}")
+            r.assert_no_failed_items()
+            r.assert_score_at_least(0.8)
+            if rubric_ref.dimensions:
+                r.assert_dimension_score_at_least(rubric_ref.dimensions[0].id, 3)
+    finally:
+        await project_client.close()
+        await credential.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
new file mode 100644
index 0000000000..f3e698c77c
--- /dev/null
+++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluators.yaml
@@ -0,0 +1,11 @@
+evaluators:
+  travel-quality:
+    type: foundry.generated_rubric
+    category: quality
+    model: gpt-4o
+    display_name: Travel Quality Rubric
+    description: Custom rubric tailored to the travel-assistant agent.
+    sources:
+      - type: agent
+        include_instructions: true
+        include_tools: true
diff --git a/python/uv.lock b/python/uv.lock
index 58c0ed50ee..dee89c9f0a 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -604,7 +604,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
     { name = "agent-framework-core", editable = "packages/core" },
-    { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = "<=1.0.0b2,>=1.0.0b2" },
+    { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = ">=1.0.0b2,<=1.0.0b2" },
 ]
 
 [[package]]