Skip to content

Commit 9a75c06

Browse files
ankursharmas and copybara-github
authored and committed
feat: Add MultiTurn Task success metric
The metric takes into account all the turns of the multi-turn conversation. The class delegates the responsibility to Vertex Gen AI Eval SDK. The V1 suffix in the class name is added to convey that there could be other versions of the task success metric as well, and those metrics could use a different strategy to evaluate task success. Co-authored-by: Ankur Sharma <ankusharma@google.com> PiperOrigin-RevId: 884504910
1 parent 360e0f7 commit 9a75c06

File tree

8 files changed

+627
-4
lines changed

8 files changed

+627
-4
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ eval = [
111111
# go/keep-sorted start
112112
"Jinja2>=3.1.4,<4.0.0", # For eval template rendering
113113
"gepa>=0.1.0",
114-
"google-cloud-aiplatform[evaluation]>=1.100.0",
114+
"google-cloud-aiplatform[evaluation]>=1.140.0",
115115
"pandas>=2.2.3",
116116
"rouge-score>=0.1.2",
117117
"tabulate>=0.9.0",

src/google/adk/evaluation/eval_metrics.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ class PrebuiltMetrics(Enum):
6161

6262
PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1"
6363

64+
MULTI_TURN_TASK_SUCCESS_V1 = "multi_turn_task_success_v1"
65+
6466

6567
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
6668
Threshold: TypeAlias = float

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@
2727
from .hallucinations_v1 import HallucinationsV1Evaluator
2828
from .metric_info_providers import FinalResponseMatchV2EvaluatorMetricInfoProvider
2929
from .metric_info_providers import HallucinationsV1EvaluatorMetricInfoProvider
30+
from .metric_info_providers import MultiTurnTaskSuccessV1MetricInfoProvider
3031
from .metric_info_providers import PerTurnUserSimulatorQualityV1MetricInfoProvider
3132
from .metric_info_providers import ResponseEvaluatorMetricInfoProvider
3233
from .metric_info_providers import RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider
3334
from .metric_info_providers import RubricBasedToolUseV1EvaluatorMetricInfoProvider
3435
from .metric_info_providers import SafetyEvaluatorV1MetricInfoProvider
3536
from .metric_info_providers import TrajectoryEvaluatorMetricInfoProvider
37+
from .multi_turn_task_success_evaluator import MultiTurnTaskSuccessV1Evaluator
3638
from .response_evaluator import ResponseEvaluator
3739
from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
3840
from .rubric_based_tool_use_quality_v1 import RubricBasedToolUseV1Evaluator
@@ -126,6 +128,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
126128
metric_info=SafetyEvaluatorV1MetricInfoProvider().get_metric_info(),
127129
evaluator=SafetyEvaluatorV1,
128130
)
131+
metric_evaluator_registry.register_evaluator(
132+
metric_info=MultiTurnTaskSuccessV1MetricInfoProvider().get_metric_info(),
133+
evaluator=MultiTurnTaskSuccessV1Evaluator,
134+
)
129135
metric_evaluator_registry.register_evaluator(
130136
metric_info=FinalResponseMatchV2EvaluatorMetricInfoProvider().get_metric_info(),
131137
evaluator=FinalResponseMatchV2Evaluator,

src/google/adk/evaluation/metric_info_providers.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,24 @@ def get_metric_info(self) -> MetricInfo:
9494
)
9595

9696

97+
class MultiTurnTaskSuccessV1MetricInfoProvider(MetricInfoProvider):
  """Metric info provider for MultiTurnTaskSuccessV1."""

  def get_metric_info(self) -> MetricInfo:
    """Returns metadata describing the multi-turn task success metric.

    The metric scores whether the agent achieved the goal(s) of the
    conversation. Scores lie in [0, 1]; higher means greater task success.
    """
    return MetricInfo(
        metric_name=PrebuiltMetrics.MULTI_TURN_TASK_SUCCESS_V1.value,
        description=(
            "Evaluates if the agent was able to achieve the goal or goals of"
            " the conversation."
            " Value range of the metric is [0, 1], with values closer"
            " to 1 to be more desirable (task successfully accomplished)."
        ),
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )
113+
114+
97115
class FinalResponseMatchV2EvaluatorMetricInfoProvider(MetricInfoProvider):
98116
"""Metric info provider for FinalResponseMatchV2Evaluator."""
99117

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
from typing import Optional
18+
19+
from typing_extensions import override
20+
21+
from .eval_case import ConversationScenario
22+
from .eval_case import Invocation
23+
from .eval_metrics import EvalMetric
24+
from .evaluator import EvaluationResult
25+
from .evaluator import Evaluator
26+
from .vertex_ai_eval_facade import _MultiTurnVertexiAiEvalFacade
27+
28+
29+
class MultiTurnTaskSuccessV1Evaluator(Evaluator):
  """Evaluates if the agent was able to achieve the goal or goals of the conversation.

  The metric takes into account all the turns of the multi-turn conversation.

  The class delegates the responsibility to Vertex Gen AI Eval SDK. The V1
  suffix in the class name is added to convey that there could be other versions
  of the task success metric as well, and those metrics could use a different
  strategy to evaluate task success.

  Using this class requires a GCP project. Please set GOOGLE_CLOUD_PROJECT and
  GOOGLE_CLOUD_LOCATION in your .env file.

  Value range of the metric is [0, 1], with values closer to 1 to be more
  desirable (task successfully accomplished).
  """

  def __init__(self, eval_metric: EvalMetric):
    # Metric configuration; only the threshold is consumed by the facade.
    self._eval_metric = eval_metric

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]] = None,
      conversation_scenario: Optional[ConversationScenario] = None,
  ) -> EvaluationResult:
    """Scores task success over the full multi-turn conversation.

    Args:
      actual_invocations: All turns of the conversation, in order.
      expected_invocations: Optional reference turns; not required by this
        metric.
      conversation_scenario: Optional scenario description; forwarded to the
        facade.

    Returns:
      An EvaluationResult produced by the Vertex Gen AI Eval SDK facade.
    """
    # Imported lazily so the vertexai dependency is only needed when this
    # evaluator actually runs.
    from ..dependencies.vertexai import vertexai

    return _MultiTurnVertexiAiEvalFacade(
        threshold=self._eval_metric.threshold,
        metric_name=vertexai.types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
    ).evaluate_invocations(
        actual_invocations, expected_invocations, conversation_scenario
    )

src/google/adk/evaluation/vertex_ai_eval_facade.py

Lines changed: 166 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,28 @@
1515
from __future__ import annotations
1616

1717
import abc
18+
import logging
1819
import math
1920
import os
2021
from typing import Optional
22+
from typing import Union
2123

2224
from google.genai import types as genai_types
2325
import pandas as pd
2426
from typing_extensions import override
2527

2628
from ..dependencies.vertexai import vertexai
29+
from .app_details import AgentDetails
2730
from .eval_case import ConversationScenario
2831
from .eval_case import Invocation
32+
from .eval_case import InvocationEvent
2933
from .evaluator import EvalStatus
3034
from .evaluator import EvaluationResult
3135
from .evaluator import Evaluator
3236
from .evaluator import PerInvocationResult
3337

38+
logger = logging.getLogger("google_adk." + __name__)
39+
3440
_ERROR_MESSAGE_SUFFIX = """
3541
You should specify both project id and location. This metric uses Vertex Gen AI
3642
Eval SDK, and it requires google cloud credentials.
@@ -56,7 +62,9 @@ class _VertexAiEvalFacade(Evaluator):
5662
def __init__(
5763
self,
5864
threshold: float,
59-
metric_name: vertexai.types.PrebuiltMetric,
65+
metric_name: Union[
66+
vertexai.types.PrebuiltMetric, vertexai.types.RubricMetric
67+
],
6068
expected_invocations_required=False,
6169
):
6270
self._threshold = threshold
@@ -119,7 +127,7 @@ def _get_score(self, eval_result) -> Optional[float]:
119127
return None
120128

121129
def _get_eval_status(self, score: Optional[float]):
122-
if score:
130+
if score is not None:
123131
return (
124132
EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
125133
)
@@ -188,7 +196,7 @@ def evaluate_invocations(
188196
)
189197
)
190198

191-
if score:
199+
if score is not None:
192200
total_score += score
193201
num_invocations += 1
194202

@@ -203,3 +211,158 @@ def evaluate_invocations(
203211
)
204212

205213
return EvaluationResult()
214+
215+
216+
class _MultiTurnVertexiAiEvalFacade(_VertexAiEvalFacade):
  """A facade for multi turn metrics exposed in Vertex Gen AI Eval SDK.

  Only the last turn receives a score; all preceding turns are reported as
  NOT_EVALUATED (score=None), since the SDK metric judges the conversation
  as a whole.

  NOTE(review): the "Vertexi" spelling in the class name is kept as-is
  because external callers reference it.
  """

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]] = None,
      conversation_scenario: Optional[ConversationScenario] = None,
  ) -> EvaluationResult:
    """Evaluates the conversation and attributes the score to the last turn.

    Args:
      actual_invocations: All turns of the conversation, in order.
      expected_invocations: Optional reference turns; substituted with Nones
        when the metric does not require them.
      conversation_scenario: Unused by multi-turn metrics.

    Returns:
      An EvaluationResult with per-invocation results; empty when no score
      could be obtained or when there are no invocations.
    """
    del conversation_scenario  # Unused by multi-turn metrics.

    # Guard: an empty conversation has no last turn to score.
    if not actual_invocations:
      return EvaluationResult()

    per_invocation_results = []
    # If expected_invocations are not required by the metric and if they are
    # not supplied, we provide a list of None so the zip below stays aligned.
    expected_invocations = (
        [None] * len(actual_invocations)
        if expected_invocations is None
        else expected_invocations
    )

    # We mark all the n-1 turns as NOT-EVALUATED for these metrics.
    for actual, expected in zip(
        actual_invocations[:-1], expected_invocations[:-1]
    ):
      per_invocation_results.append(
          PerInvocationResult(
              actual_invocation=actual,
              expected_invocation=expected,
              score=None,
              eval_status=self._get_eval_status(None),
          )
      )

    # Only evaluate the last turn and take into account all the previous turns.
    eval_case = vertexai.types.EvalCase(
        agent_data=_MultiTurnVertexiAiEvalFacade._get_agent_data(
            actual_invocations
        )
    )
    dataset = vertexai.types.EvaluationDataset(eval_cases=[eval_case])

    eval_case_result = self._perform_eval(
        dataset=dataset, metrics=[self._metric_name]
    )

    score = self._get_score(eval_case_result)
    per_invocation_results.append(
        PerInvocationResult(
            actual_invocation=actual_invocations[-1],
            expected_invocation=expected_invocations[-1],
            score=score,
            eval_status=self._get_eval_status(score),
        )
    )

    if score is not None:
      return EvaluationResult(
          overall_score=score,
          overall_eval_status=self._get_eval_status(score),
          per_invocation_results=per_invocation_results,
      )

    return EvaluationResult()

  @staticmethod
  def _get_agent_data(
      actual_invocations: list[Invocation],
  ) -> vertexai.types.evals.AgentData:
    """Bundles agent configs and conversation turns for the SDK eval case."""
    return vertexai.types.evals.AgentData(
        agents=_MultiTurnVertexiAiEvalFacade._get_agent_details(
            actual_invocations
        ),
        turns=_MultiTurnVertexiAiEvalFacade._get_turns(actual_invocations),
    )

  @staticmethod
  def _get_turns(
      actual_invocations: list[Invocation],
  ) -> list[vertexai.types.evals.ConversationTurn]:
    """Maps each invocation to an SDK ConversationTurn, preserving order."""
    return [
        _MultiTurnVertexiAiEvalFacade._map_invocation_turn(index, invocation)
        for index, invocation in enumerate(actual_invocations)
    ]

  @staticmethod
  def _map_invocation_turn(
      turn_index: int,
      invocation: Invocation,
  ) -> vertexai.types.evals.ConversationTurn:
    """Converts one invocation to a ConversationTurn.

    The turn's events are ordered: user message, intermediate invocation
    events, then the agent's final response.
    """
    agent_events = []
    agent_events.append(
        vertexai.types.evals.AgentEvent(
            author="user", content=invocation.user_content
        )
    )

    for invocation_event in invocation.intermediate_data.invocation_events:
      agent_events.append(
          _MultiTurnVertexiAiEvalFacade._map_invocation_event_to_agent_event(
              invocation_event
          )
      )

    agent_events.append(
        vertexai.types.evals.AgentEvent(
            author="agent", content=invocation.final_response
        )
    )

    return vertexai.types.evals.ConversationTurn(
        turn_index=turn_index,
        events=agent_events,
        turn_id=invocation.invocation_id,
    )

  @staticmethod
  def _map_invocation_event_to_agent_event(
      invocation_event: InvocationEvent,
  ) -> vertexai.types.evals.AgentEvent:
    """Converts an ADK invocation event to an SDK AgentEvent."""
    return vertexai.types.evals.AgentEvent(
        author=invocation_event.author, content=invocation_event.content
    )

  @staticmethod
  def _get_agent_details(
      actual_invocations: list[Invocation],
  ) -> dict[str, vertexai.types.evals.AgentConfig]:
    """Collects each distinct agent's config across all invocations.

    First occurrence of an agent name wins; later duplicates are ignored.
    """
    agent_configs = {}
    for invocation in actual_invocations:
      if invocation.app_details and invocation.app_details.agent_details:
        for (
            agent_name,
            agent_details,
        ) in invocation.app_details.agent_details.items():
          if agent_name not in agent_configs:
            agent_configs[agent_name] = (
                _MultiTurnVertexiAiEvalFacade._map_agent_details_to_agent_config(
                    agent_details
                )
            )

    return agent_configs

  @staticmethod
  def _map_agent_details_to_agent_config(
      agent_details: AgentDetails,
  ) -> vertexai.types.evals.AgentConfig:
    """Converts ADK AgentDetails to the SDK's AgentConfig."""
    return vertexai.types.evals.AgentConfig(
        agent_id=agent_details.name,
        instruction=agent_details.instructions,
        tools=agent_details.tool_declarations,
    )

0 commit comments

Comments
 (0)