Skip to content

Commit 9a75c06

Browse files
ankursharmas and copybara-github
authored and committed
feat: Add MultiTurn Task success metric
The metric takes into account all the turns of the multi-turn conversation. The class delegates the responsibility to Vertex Gen AI Eval SDK. The V1 suffix in the class name is added to convey that there could be other versions of the task success metric as well, and those metrics could use a different strategy to evaluate task success. Co-authored-by: Ankur Sharma <ankusharma@google.com> PiperOrigin-RevId: 884504910
1 parent 360e0f7 commit 9a75c06

File tree

8 files changed

+627
-4
lines changed

8 files changed

+627
-4
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ eval = [
111111
# go/keep-sorted start
112112
"Jinja2>=3.1.4,<4.0.0", # For eval template rendering
113113
"gepa>=0.1.0",
114-
"google-cloud-aiplatform[evaluation]>=1.100.0",
114+
"google-cloud-aiplatform[evaluation]>=1.140.0",
115115
"pandas>=2.2.3",
116116
"rouge-score>=0.1.2",
117117
"tabulate>=0.9.0",

src/google/adk/evaluation/eval_metrics.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ class PrebuiltMetrics(Enum):
6161

6262
PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1"
6363

64+
MULTI_TURN_TASK_SUCCESS_V1 = "multi_turn_task_success_v1"
65+
6466

6567
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
6668
Threshold: TypeAlias = float

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@
2727
from .hallucinations_v1 import HallucinationsV1Evaluator
2828
from .metric_info_providers import FinalResponseMatchV2EvaluatorMetricInfoProvider
2929
from .metric_info_providers import HallucinationsV1EvaluatorMetricInfoProvider
30+
from .metric_info_providers import MultiTurnTaskSuccessV1MetricInfoProvider
3031
from .metric_info_providers import PerTurnUserSimulatorQualityV1MetricInfoProvider
3132
from .metric_info_providers import ResponseEvaluatorMetricInfoProvider
3233
from .metric_info_providers import RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider
3334
from .metric_info_providers import RubricBasedToolUseV1EvaluatorMetricInfoProvider
3435
from .metric_info_providers import SafetyEvaluatorV1MetricInfoProvider
3536
from .metric_info_providers import TrajectoryEvaluatorMetricInfoProvider
37+
from .multi_turn_task_success_evaluator import MultiTurnTaskSuccessV1Evaluator
3638
from .response_evaluator import ResponseEvaluator
3739
from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
3840
from .rubric_based_tool_use_quality_v1 import RubricBasedToolUseV1Evaluator
@@ -126,6 +128,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
126128
metric_info=SafetyEvaluatorV1MetricInfoProvider().get_metric_info(),
127129
evaluator=SafetyEvaluatorV1,
128130
)
131+
metric_evaluator_registry.register_evaluator(
132+
metric_info=MultiTurnTaskSuccessV1MetricInfoProvider().get_metric_info(),
133+
evaluator=MultiTurnTaskSuccessV1Evaluator,
134+
)
129135
metric_evaluator_registry.register_evaluator(
130136
metric_info=FinalResponseMatchV2EvaluatorMetricInfoProvider().get_metric_info(),
131137
evaluator=FinalResponseMatchV2Evaluator,

src/google/adk/evaluation/metric_info_providers.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,24 @@ def get_metric_info(self) -> MetricInfo:
9494
)
9595

9696

97+
class MultiTurnTaskSuccessV1MetricInfoProvider(MetricInfoProvider):
  """Metric info provider for MultiTurnTaskSuccessV1."""

  def get_metric_info(self) -> MetricInfo:
    """Returns metadata describing the multi-turn task success metric.

    The metric scores whether the agent achieved the goal(s) of the
    conversation. Scores lie in [0, 1]; higher means greater task success.
    """
    return MetricInfo(
        metric_name=PrebuiltMetrics.MULTI_TURN_TASK_SUCCESS_V1.value,
        description=(
            "Evaluates if the agent was able to achieve the goal or goals of"
            " the conversation."
            " Value range of the metric is [0, 1], with values closer"
            " to 1 to be more desirable (task successfully accomplished)."
        ),
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )
113+
114+
97115
class FinalResponseMatchV2EvaluatorMetricInfoProvider(MetricInfoProvider):
98116
"""Metric info provider for FinalResponseMatchV2Evaluator."""
99117

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
from typing import Optional
18+
19+
from typing_extensions import override
20+
21+
from .eval_case import ConversationScenario
22+
from .eval_case import Invocation
23+
from .eval_metrics import EvalMetric
24+
from .evaluator import EvaluationResult
25+
from .evaluator import Evaluator
26+
from .vertex_ai_eval_facade import _MultiTurnVertexiAiEvalFacade
27+
28+
29+
class MultiTurnTaskSuccessV1Evaluator(Evaluator):
  """Evaluates if the agent was able to achieve the goal or goals of the conversation.

  The metric takes into account all the turns of the multi-turn conversation.

  The class delegates the responsibility to Vertex Gen AI Eval SDK. The V1
  suffix in the class name is added to convey that there could be other versions
  of the task success metric as well, and those metrics could use a different
  strategy to evaluate task success.

  Using this class requires a GCP project. Please set GOOGLE_CLOUD_PROJECT and
  GOOGLE_CLOUD_LOCATION in your .env file.

  Value range of the metric is [0, 1], with values closer to 1 to be more
  desirable (task successfully accomplished).
  """

  def __init__(self, eval_metric: EvalMetric):
    # Metric configuration; only the threshold is consumed by the facade.
    self._eval_metric = eval_metric

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]] = None,
      conversation_scenario: Optional[ConversationScenario] = None,
  ) -> EvaluationResult:
    """Scores task success over the full multi-turn conversation.

    Args:
      actual_invocations: All turns of the conversation, in order.
      expected_invocations: Optional reference turns; not required by this
        metric.
      conversation_scenario: Optional scenario description; forwarded to the
        facade.

    Returns:
      An EvaluationResult produced by the Vertex Gen AI Eval SDK facade.
    """
    # Imported lazily so the vertexai dependency is only needed when this
    # evaluator actually runs.
    from ..dependencies.vertexai import vertexai

    return _MultiTurnVertexiAiEvalFacade(
        threshold=self._eval_metric.threshold,
        metric_name=vertexai.types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
    ).evaluate_invocations(
        actual_invocations, expected_invocations, conversation_scenario
    )

src/google/adk/evaluation/vertex_ai_eval_facade.py

Lines changed: 166 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,28 @@
1515
from __future__ import annotations
1616

1717
import abc
18+
import logging
1819
import math
1920
import os
2021
from typing import Optional
22+
from typing import Union
2123

2224
from google.genai import types as genai_types
2325
import pandas as pd
2426
from typing_extensions import override
2527

2628
from ..dependencies.vertexai import vertexai
29+
from .app_details import AgentDetails
2730
from .eval_case import ConversationScenario
2831
from .eval_case import Invocation
32+
from .eval_case import InvocationEvent
2933
from .evaluator import EvalStatus
3034
from .evaluator import EvaluationResult
3135
from .evaluator import Evaluator
3236
from .evaluator import PerInvocationResult
3337

38+
logger = logging.getLogger("google_adk." + __name__)
39+
3440
_ERROR_MESSAGE_SUFFIX = """
3541
You should specify both project id and location. This metric uses Vertex Gen AI
3642
Eval SDK, and it requires google cloud credentials.
@@ -56,7 +62,9 @@ class _VertexAiEvalFacade(Evaluator):
5662
def __init__(
5763
self,
5864
threshold: float,
59-
metric_name: vertexai.types.PrebuiltMetric,
65+
metric_name: Union[
66+
vertexai.types.PrebuiltMetric, vertexai.types.RubricMetric
67+
],
6068
expected_invocations_required=False,
6169
):
6270
self._threshold = threshold
@@ -119,7 +127,7 @@ def _get_score(self, eval_result) -> Optional[float]:
119127
return None
120128

121129
def _get_eval_status(self, score: Optional[float]):
122-
if score:
130+
if score is not None:
123131
return (
124132
EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
125133
)
@@ -188,7 +196,7 @@ def evaluate_invocations(
188196
)
189197
)
190198

191-
if score:
199+
if score is not None:
192200
total_score += score
193201
num_invocations += 1
194202

@@ -203,3 +211,158 @@ def evaluate_invocations(
203211
)
204212

205213
return EvaluationResult()
214+
215+
216+
class _MultiTurnVertexiAiEvalFacade(_VertexAiEvalFacade):
  """A facade for multi turn metrics exposed in Vertex Gen AI Eval SDK.

  Only the last turn receives a score; all preceding turns are reported as
  NOT_EVALUATED (score=None), since the SDK metric judges the conversation
  as a whole.

  NOTE(review): the "Vertexi" spelling in the class name is kept as-is
  because external callers reference it.
  """

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]] = None,
      conversation_scenario: Optional[ConversationScenario] = None,
  ) -> EvaluationResult:
    """Evaluates the conversation and attributes the score to the last turn.

    Args:
      actual_invocations: All turns of the conversation, in order.
      expected_invocations: Optional reference turns; substituted with Nones
        when the metric does not require them.
      conversation_scenario: Unused by multi-turn metrics.

    Returns:
      An EvaluationResult with per-invocation results; empty when no score
      could be obtained or when there are no invocations.
    """
    del conversation_scenario  # Unused by multi-turn metrics.

    # Guard: an empty conversation has no last turn to score.
    if not actual_invocations:
      return EvaluationResult()

    per_invocation_results = []
    # If expected_invocations are not required by the metric and if they are
    # not supplied, we provide a list of None so the zip below stays aligned.
    expected_invocations = (
        [None] * len(actual_invocations)
        if expected_invocations is None
        else expected_invocations
    )

    # We mark all the n-1 turns as NOT-EVALUATED for these metrics.
    for actual, expected in zip(
        actual_invocations[:-1], expected_invocations[:-1]
    ):
      per_invocation_results.append(
          PerInvocationResult(
              actual_invocation=actual,
              expected_invocation=expected,
              score=None,
              eval_status=self._get_eval_status(None),
          )
      )

    # Only evaluate the last turn and take into account all the previous turns.
    eval_case = vertexai.types.EvalCase(
        agent_data=_MultiTurnVertexiAiEvalFacade._get_agent_data(
            actual_invocations
        )
    )
    dataset = vertexai.types.EvaluationDataset(eval_cases=[eval_case])

    eval_case_result = self._perform_eval(
        dataset=dataset, metrics=[self._metric_name]
    )

    score = self._get_score(eval_case_result)
    per_invocation_results.append(
        PerInvocationResult(
            actual_invocation=actual_invocations[-1],
            expected_invocation=expected_invocations[-1],
            score=score,
            eval_status=self._get_eval_status(score),
        )
    )

    if score is not None:
      return EvaluationResult(
          overall_score=score,
          overall_eval_status=self._get_eval_status(score),
          per_invocation_results=per_invocation_results,
      )

    return EvaluationResult()

  @staticmethod
  def _get_agent_data(
      actual_invocations: list[Invocation],
  ) -> vertexai.types.evals.AgentData:
    """Bundles agent configs and conversation turns for the SDK eval case."""
    return vertexai.types.evals.AgentData(
        agents=_MultiTurnVertexiAiEvalFacade._get_agent_details(
            actual_invocations
        ),
        turns=_MultiTurnVertexiAiEvalFacade._get_turns(actual_invocations),
    )

  @staticmethod
  def _get_turns(
      actual_invocations: list[Invocation],
  ) -> list[vertexai.types.evals.ConversationTurn]:
    """Maps each invocation to an SDK ConversationTurn, preserving order."""
    return [
        _MultiTurnVertexiAiEvalFacade._map_invocation_turn(index, invocation)
        for index, invocation in enumerate(actual_invocations)
    ]

  @staticmethod
  def _map_invocation_turn(
      turn_index: int,
      invocation: Invocation,
  ) -> vertexai.types.evals.ConversationTurn:
    """Converts one invocation to a ConversationTurn.

    The turn's events are ordered: user message, intermediate invocation
    events, then the agent's final response.
    """
    agent_events = []
    agent_events.append(
        vertexai.types.evals.AgentEvent(
            author="user", content=invocation.user_content
        )
    )

    for invocation_event in invocation.intermediate_data.invocation_events:
      agent_events.append(
          _MultiTurnVertexiAiEvalFacade._map_invocation_event_to_agent_event(
              invocation_event
          )
      )

    agent_events.append(
        vertexai.types.evals.AgentEvent(
            author="agent", content=invocation.final_response
        )
    )

    return vertexai.types.evals.ConversationTurn(
        turn_index=turn_index,
        events=agent_events,
        turn_id=invocation.invocation_id,
    )

  @staticmethod
  def _map_invocation_event_to_agent_event(
      invocation_event: InvocationEvent,
  ) -> vertexai.types.evals.AgentEvent:
    """Converts an ADK invocation event to an SDK AgentEvent."""
    return vertexai.types.evals.AgentEvent(
        author=invocation_event.author, content=invocation_event.content
    )

  @staticmethod
  def _get_agent_details(
      actual_invocations: list[Invocation],
  ) -> dict[str, vertexai.types.evals.AgentConfig]:
    """Collects each distinct agent's config across all invocations.

    First occurrence of an agent name wins; later duplicates are ignored.
    """
    agent_configs = {}
    for invocation in actual_invocations:
      if invocation.app_details and invocation.app_details.agent_details:
        for (
            agent_name,
            agent_details,
        ) in invocation.app_details.agent_details.items():
          if agent_name not in agent_configs:
            agent_configs[agent_name] = (
                _MultiTurnVertexiAiEvalFacade._map_agent_details_to_agent_config(
                    agent_details
                )
            )

    return agent_configs

  @staticmethod
  def _map_agent_details_to_agent_config(
      agent_details: AgentDetails,
  ) -> vertexai.types.evals.AgentConfig:
    """Converts ADK AgentDetails to the SDK's AgentConfig."""
    return vertexai.types.evals.AgentConfig(
        agent_id=agent_details.name,
        instruction=agent_details.instructions,
        tools=agent_details.tool_declarations,
    )

0 commit comments

Comments
 (0)