From 2ea7c0cc7e679e89b97472efe3b05ad4e872a10e Mon Sep 17 00:00:00 2001
From: Jason Dai <jsndai@google.com>
Date: Fri, 13 Feb 2026 17:03:45 -0800
Subject: [PATCH] chore: GenAI Client(evals) - Add class methods for parsing
 raw Agent Session history into the new `AgentData` structure. Add
 `agent_resource_name` attribute to `AgentConfig` and loading methods. feat:
 GenAI Client(evals) - Add 3 new multi-turn predefined metrics for agent
 evaluation (`MULTI_TURN_TOOL_USE_QUALITY`, `MULTI_TURN_TRAJECTORY_QUALITY`,
 `MULTI_TURN_TASK_SUCCESS`). chore: GenAI Client(evals) - Update evaluation
 data converters and metric handlers to natively support `AgentData` in
 `EvaluationDataset` and `EvalCase`. chore: GenAI Client(evals) - Map
 `agent_data` to `agent_eval_data` in Vertex REST payload generation.

PiperOrigin-RevId: 869945268
---
 .../vertexai/genai/replays/test_evaluate.py   | 257 ++++++++++++++++++
 .../genai/replays/test_evaluate_instances.py  |  33 ---
 vertexai/_genai/_evals_common.py              |  24 +-
 vertexai/_genai/_evals_constant.py            |   3 +
 vertexai/_genai/_evals_data_converters.py     | 136 ++++++---
 vertexai/_genai/_evals_metric_handlers.py     |  22 +-
 vertexai/_genai/_evals_metric_loaders.py      |  12 +
 vertexai/_genai/evals.py                      |  32 ++-
 vertexai/_genai/types/evals.py                | 144 ++++++++++
 9 files changed, 577 insertions(+), 86 deletions(-)

diff --git a/tests/unit/vertexai/genai/replays/test_evaluate.py b/tests/unit/vertexai/genai/replays/test_evaluate.py
index e76d664d4c..931bea8cb9 100644
--- a/tests/unit/vertexai/genai/replays/test_evaluate.py
+++ b/tests/unit/vertexai/genai/replays/test_evaluate.py
@@ -16,6 +16,7 @@
 
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai._genai import types
+from google.genai import types as genai_types
 import pandas as pd
 
 
@@ -96,6 +97,262 @@ def test_evaluation_byor(client):
         assert case_result.response_candidate_results is not None
 
 
+def test_evaluation_agent_data(client):
+    """Tests evaluate method with AgentData."""
+    client._api_client._http_options.base_url = (
+        "https://autopush-aiplatform.sandbox.googleapis.com/"
+    )
+    client._api_client._http_options.api_version = "v1beta1"
+
+    agent_data = types.evals.AgentData(
+        agents={
+            "coordinator": types.evals.AgentConfig(
+                agent_id="coordinator",
+                agent_type="RouterAgent",
+                description="Root agent that delegates to specialists.",
+                instruction=(
+                    "You are a travel coordinator. Delegate flight tasks to"
+                    " 'flight_bot' and hotel tasks to 'hotel_bot'."
+                ),
+                sub_agents=["flight_bot", "hotel_bot"],
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="delegate_to_agent",
+                                description=("Delegates conversation to a sub-agent."),
+                            )
+                        ]
+                    )
+                ],
+            ),
+            "flight_bot": types.evals.AgentConfig(
+                agent_id="flight_bot",
+                agent_type="SpecialistAgent",
+                description="Handles flight searches.",
+                instruction="Search for flights using the available tools.",
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="search_flights",
+                                description=(
+                                    "Finds flights based on origin and" " destination."
+                                ),
+                            )
+                        ]
+                    )
+                ],
+            ),
+            "hotel_bot": types.evals.AgentConfig(
+                agent_id="hotel_bot",
+                agent_type="SpecialistAgent",
+                description="Handles hotel searches.",
+                instruction="Search for hotels using the available tools.",
+                tools=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(
+                                name="search_hotels",
+                                description="Finds hotels in a given location.",
+                            )
+                        ]
+                    )
+                ],
+            ),
+        },
+        turns=[
+            types.evals.ConversationTurn(
+                turn_index=0,
+                events=[
+                    types.evals.AgentEvent(
+                        author="user",
+                        content=genai_types.Content(
+                            role="user",
+                            parts=[
+                                genai_types.Part(
+                                    text=(
+                                        "I need to book a flight to NYC for next"
+                                        " Monday."
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="coordinator",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    function_call=genai_types.FunctionCall(
+                                        name="delegate_to_agent",
+                                        args={"agent_name": "flight_bot"},
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="flight_bot",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    function_call=genai_types.FunctionCall(
+                                        name="search_flights",
+                                        args={
+                                            "destination": "NYC",
+                                            "date": "next Monday",
+                                        },
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="flight_bot",
+                        content=genai_types.Content(
+                            role="tool",
+                            parts=[
+                                genai_types.Part(
+                                    function_response=genai_types.FunctionResponse(
+                                        name="search_flights",
+                                        response={
+                                            "flights": [
+                                                {
+                                                    "id": "UA100",
+                                                    "price": "$300",
+                                                }
+                                            ]
+                                        },
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="flight_bot",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    text="I found flight UA100 to NYC for $300."
+                                )
+                            ],
+                        ),
+                    ),
+                ],
+            ),
+            types.evals.ConversationTurn(
+                turn_index=1,
+                events=[
+                    types.evals.AgentEvent(
+                        author="user",
+                        content=genai_types.Content(
+                            role="user",
+                            parts=[
+                                genai_types.Part(
+                                    text=(
+                                        "Great, book that. I also need a hotel"
+                                        " there."
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="coordinator",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    function_call=genai_types.FunctionCall(
+                                        name="delegate_to_agent",
+                                        args={"agent_name": "hotel_bot"},
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="hotel_bot",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    function_call=genai_types.FunctionCall(
+                                        name="search_hotels",
+                                        args={"location": "NYC"},
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="hotel_bot",
+                        content=genai_types.Content(
+                            role="tool",
+                            parts=[
+                                genai_types.Part(
+                                    function_response=genai_types.FunctionResponse(
+                                        name="search_hotels",
+                                        response={
+                                            "hotels": [
+                                                {
+                                                    "name": "Central Park Hotel",
+                                                    "rating": 4.5,
+                                                }
+                                            ]
+                                        },
+                                    )
+                                )
+                            ],
+                        ),
+                    ),
+                    types.evals.AgentEvent(
+                        author="hotel_bot",
+                        content=genai_types.Content(
+                            role="model",
+                            parts=[
+                                genai_types.Part(
+                                    text="I recommend the Central Park Hotel."
+                                )
+                            ],
+                        ),
+                    ),
+                ],
+            ),
+        ],
+    )
+
+    # Create the EvalCase and wrap it in an EvaluationDataset
+    eval_case = types.EvalCase(agent_data=agent_data)
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    metrics = [
+        types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+        assert summary.mean_score is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
index 3697189f59..2ebabb8641 100644
--- a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
+++ b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
@@ -105,39 +105,6 @@ def test_pointwise_metric(client):
     assert response.pointwise_metric_result.score is not None
 
 
-# def test_predefined_metric_with_agent_data(client):
-#     """Tests the _evaluate_instances method with predefined metric and agent_data."""
-#     agent_data = types.evals.AgentData(
-#         agent_config=types.evals.AgentConfig(
-#             tools=[
-#                 genai_types.Tool(
-#                     function_declarations=[
-#                         genai_types.FunctionDeclaration(name="search")
-#                     ]
-#                 )
-#             ],
-#             developer_instruction=types.evals.InstanceData(text="instruction"),
-#         ),
-#         events=types.evals.Events(
-#             event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
-#         ),
-#     )
-#     instance = types.EvaluationInstance(
-#         prompt=types.evals.InstanceData(text="What is the capital of France?"),
-#         response=types.evals.InstanceData(text="Paris"),
-#         reference=types.evals.InstanceData(text="Paris"),
-#         agent_data=agent_data,
-#     )
-
-#     response = client.evals.evaluate_instances(
-#         metric_config=types._EvaluateInstancesRequestParameters(
-#             metrics=[types.Metric(name="general_quality_v1")],
-#             instance=instance,
-#         )
-#     )
-#     assert response.metric_results[0].score is not None
-
-
 def test_pairwise_metric_with_autorater(client):
     """Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig."""
 
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 0bc28994ed..f77bca328d 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -1047,8 +1047,10 @@ def _resolve_dataset_inputs(
     datasets_to_process = dataset
     logger.info("Processing %s dataset(s).", num_response_candidates)
 
-    loaded_raw_datasets: list[list[dict[str, Any]]] = []
-    schemas_for_merge: list[str] = []
+    if len(datasets_to_process) == 1 and datasets_to_process[0].eval_cases:
+        return datasets_to_process[0], 1
+
+    parsed_evaluation_datasets: list[types.EvaluationDataset] = []
 
     for i, ds_item in enumerate(datasets_to_process):
         if not isinstance(ds_item, types.EvaluationDataset):
@@ -1062,9 +1064,13 @@ def _resolve_dataset_inputs(
                 f"Item at index {i} is not an EvaluationDataset: {type(ds_item)}"
             )
 
+        if ds_item.eval_cases:
+            logger.info("Dataset %d already contains eval_cases.", i)
+            parsed_evaluation_datasets.append(ds_item)
+            continue
+
         ds_source_for_loader = _get_dataset_source(ds_item)
         current_loaded_data = loader.load(ds_source_for_loader)
-        loaded_raw_datasets.append(current_loaded_data)
 
         if dataset_schema:
             current_schema = _evals_data_converters.EvalDatasetSchema(dataset_schema)
@@ -1072,7 +1078,6 @@ def _resolve_dataset_inputs(
             current_schema = _evals_data_converters.auto_detect_dataset_schema(  # type: ignore[assignment]
                 current_loaded_data
             )
-        schemas_for_merge.append(current_schema)
 
         logger.info(
             "Dataset %d: Schema: %s. Using %s converter.",
@@ -1082,13 +1087,12 @@ def _resolve_dataset_inputs(
                 current_schema
             ).__class__.__name__,
         )
+        converter = _evals_data_converters.get_dataset_converter(current_schema)
+        parsed_evaluation_datasets.append(converter.convert(current_loaded_data))
 
-    processed_eval_dataset = (
-        _evals_data_converters.merge_response_datasets_into_canonical_format(
-            raw_datasets=loaded_raw_datasets,
-            schemas=schemas_for_merge,
-            agent_info=agent_info,
-        )
+    processed_eval_dataset = _evals_data_converters.merge_evaluation_datasets(
+        datasets=parsed_evaluation_datasets,
+        agent_info=agent_info,
     )
 
     if not processed_eval_dataset.eval_cases:
diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py
index 6fc27d94e0..fc3f3e2771 100644
--- a/vertexai/_genai/_evals_constant.py
+++ b/vertexai/_genai/_evals_constant.py
@@ -23,6 +23,9 @@
         "safety_v1",
         "multi_turn_general_quality_v1",
         "multi_turn_text_quality_v1",
+        "multi_turn_tool_use_quality_v1",
+        "multi_turn_trajectory_quality_v1",
+        "multi_turn_task_success_v1",
         "final_response_match_v2",
         "final_response_reference_free_v1",
         "final_response_quality_v1",
diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py
index 459600caff..876f9a7341 100644
--- a/vertexai/_genai/_evals_data_converters.py
+++ b/vertexai/_genai/_evals_data_converters.py
@@ -202,24 +202,25 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
             system_instruction_data = item.pop("instruction", None)
             rubric_groups_data = item.pop("rubric_groups", None)
             intermediate_events_data = item.pop("intermediate_events", None)
+            agent_data_raw = item.pop("agent_data", None)
 
-            if not response_data:
+            if not response_data and not agent_data_raw:
                 raise ValueError(
                     "Response is required but missing for %s." % eval_case_id
                 )
-            if not prompt_data:
+            if not prompt_data and not agent_data_raw:
                 raise ValueError(
                     "Prompt is required but missing for %s." % eval_case_id
                 )
 
-            prompt: genai_types.Content
+            prompt: Optional[genai_types.Content] = None
             if isinstance(prompt_data, str):
                 prompt = genai_types.Content(parts=[genai_types.Part(text=prompt_data)])
             elif isinstance(prompt_data, dict):
                 prompt = genai_types.Content.model_validate(prompt_data)
             elif isinstance(prompt_data, genai_types.Content):
                 prompt = prompt_data
-            else:
+            elif not agent_data_raw:
                 raise ValueError(
                     "Invalid prompt type for case %s: %s" % (i, type(prompt_data))
                 )
@@ -265,7 +266,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
                             type(content),
                         )
 
-            responses: list[types.ResponseCandidate]
+            responses: Optional[list[types.ResponseCandidate]] = None
             if isinstance(response_data, dict):
                 responses = [
                     types.ResponseCandidate(
@@ -282,7 +283,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
                 ]
             elif isinstance(response_data, genai_types.Content):
                 responses = [types.ResponseCandidate(response=response_data)]
-            else:
+            elif not agent_data_raw:
                 raise ValueError(
                     "Invalid response type for case %s: %s" % (i, type(response_data))
                 )
@@ -411,6 +412,41 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
                         i,
                     )
 
+            agent_data: Optional[types.evals.AgentData] = None
+            if agent_data_raw:
+                if isinstance(agent_data_raw, str):
+                    try:
+                        agent_data_dict = json.loads(agent_data_raw)
+                        agent_data = types.evals.AgentData.model_validate(
+                            agent_data_dict
+                        )
+                    except json.JSONDecodeError:
+                        logger.warning(
+                            "Could not decode agent_data JSON string for case %s.", i
+                        )
+                    except ValidationError as e:
+                        logger.warning(
+                            "Failed to validate agent_data for case %s: %s", i, e
+                        )
+                elif isinstance(agent_data_raw, dict):
+                    try:
+                        agent_data = types.evals.AgentData.model_validate(
+                            agent_data_raw
+                        )
+                    except ValidationError as e:
+                        logger.warning(
+                            "Failed to validate agent_data for case %s: %s", i, e
+                        )
+                elif isinstance(agent_data_raw, types.evals.AgentData):
+                    agent_data = agent_data_raw
+                else:
+                    logger.warning(
+                        "Invalid type for agent_data in case %s. Expected str, dict"
+                        " or types.evals.AgentData object. Got %s",
+                        i,
+                        type(agent_data_raw),
+                    )
+
             eval_case = types.EvalCase(
                 eval_case_id=eval_case_id,
                 prompt=prompt,
@@ -420,6 +456,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
                 system_instruction=system_instruction,
                 rubric_groups=rubric_groups,
                 intermediate_events=intermediate_events,
+                agent_data=agent_data,
                 **item,  # Pass remaining columns as extra fields to EvalCase.
                 # They can be used for custom metric prompt templates.
             )
@@ -605,6 +642,9 @@ def auto_detect_dataset_schema(
                 if "role" in messages_list[0] and "content" in messages_list[0]:
                     return EvalDatasetSchema.OPENAI
 
+    if "agent_data" in keys:
+        return EvalDatasetSchema.FLATTEN
+
     if {"prompt", "response"}.issubset(keys) or {
         "response",
         "reference",
@@ -701,51 +741,40 @@ def _validate_case_consistency(
         )
 
 
-def merge_response_datasets_into_canonical_format(
-    raw_datasets: list[list[dict[str, Any]]],
-    schemas: list[str],
+def merge_evaluation_datasets(
+    datasets: list[types.EvaluationDataset],
     agent_info: Optional[types.evals.AgentInfo] = None,
 ) -> types.EvaluationDataset:
-    """Merges multiple raw response datasets into a single EvaluationDataset.
+    """Merges multiple EvaluationDatasets into a single EvaluationDataset.
 
-    Assumes that each dataset in raw_datasets has responses corresponding
-    to the same set of prompts, in the same order. The prompt, reference,
-    system_instruction, and conversation_history are taken from the first dataset.
+    Assumes that each dataset has responses corresponding to the same set of
+    prompts, in the same order. The prompt, reference, system_instruction, and
+    conversation_history are taken from the first dataset.
     """
-    if not isinstance(raw_datasets, list):
-        raise TypeError(
-            "Input 'raw_datasets' must be a list, got %s." % type(raw_datasets)
-        )
-    if not raw_datasets or not all(isinstance(ds, list) for ds in raw_datasets):
-        raise ValueError(
-            "Input 'raw_datasets' cannot be empty and must be a list of lists."
-        )
-    if not schemas or len(schemas) != len(raw_datasets):
-        raise ValueError(
-            "A list of schemas must be provided, one for each raw dataset. "
-            "Got %s schemas for %s datasets." % (len(schemas), len(raw_datasets))
-        )
+    if not datasets:
+        raise ValueError("Input 'datasets' cannot be empty.")
+
+    num_expected_cases = 0
+    if datasets[0].eval_cases:
+        num_expected_cases = len(datasets[0].eval_cases)
 
-    num_expected_cases = len(raw_datasets[0])
     if num_expected_cases == 0:
         logger.warning(
             "The first dataset has no evaluation cases. Result will be empty."
         )
         return types.EvaluationDataset(eval_cases=[])
 
-    parsed_evaluation_datasets: list[types.EvaluationDataset] = []
-    for i, (raw_ds_entry, schema) in enumerate(zip(raw_datasets, schemas)):
-        if len(raw_ds_entry) != num_expected_cases:
+    for i, ds in enumerate(datasets):
+        current_len = len(ds.eval_cases) if ds.eval_cases else 0
+        if current_len != num_expected_cases:
             raise ValueError(
                 "All datasets must have the same number of evaluation cases. "
-                "Base dataset (0) has %s, but dataset %s (schema: %s) has %s."
-                % (num_expected_cases, i, schema, len(raw_ds_entry))
+                "Base dataset (0) has %s, but dataset %s has %s."
+                % (num_expected_cases, i, current_len)
             )
-        converter = get_dataset_converter(schema)
-        parsed_evaluation_datasets.append(converter.convert(raw_ds_entry))
 
     merged_eval_cases: list[types.EvalCase] = []
-    base_parsed_dataset = parsed_evaluation_datasets[0]
+    base_parsed_dataset = datasets[0]
 
     for case_idx in range(num_expected_cases):
         base_eval_case: types.EvalCase = (
@@ -781,9 +810,7 @@ def merge_response_datasets_into_canonical_format(
             },
             exclude_none=True,
         )
-        for dataset_idx_offset, current_parsed_ds in enumerate(
-            parsed_evaluation_datasets[1:], start=1
-        ):
+        for dataset_idx_offset, current_parsed_ds in enumerate(datasets[1:], start=1):
             current_ds_eval_case: types.EvalCase = (
                 current_parsed_ds.eval_cases[case_idx]
                 if current_parsed_ds.eval_cases
@@ -838,3 +865,36 @@ def merge_response_datasets_into_canonical_format(
         merged_eval_cases.append(merged_case)
 
     return types.EvaluationDataset(eval_cases=merged_eval_cases)
+
+
+def merge_response_datasets_into_canonical_format(
+    raw_datasets: list[list[dict[str, Any]]],
+    schemas: list[str],
+    agent_info: Optional[types.evals.AgentInfo] = None,
+) -> types.EvaluationDataset:
+    """Merges multiple raw response datasets into a single EvaluationDataset.
+
+    Converts each raw dataset with the converter for its schema, then delegates
+    to `merge_evaluation_datasets`. Assumes every dataset holds responses to the
+    same prompts in the same order. Raises TypeError/ValueError on bad inputs.
+    """
+    if not isinstance(raw_datasets, list):
+        raise TypeError(
+            "Input 'raw_datasets' must be a list, got %s." % type(raw_datasets)
+        )
+    if not raw_datasets or not all(isinstance(ds, list) for ds in raw_datasets):
+        raise ValueError(
+            "Input 'raw_datasets' cannot be empty and must be a list of lists."
+        )
+    if not schemas or len(schemas) != len(raw_datasets):
+        raise ValueError(
+            "A list of schemas must be provided, one for each raw dataset. "
+            "Got %s schemas for %s datasets." % (len(schemas), len(raw_datasets))
+        )
+
+    parsed_evaluation_datasets: list[types.EvaluationDataset] = []
+    for raw_ds_entry, schema in zip(raw_datasets, schemas):
+        converter = get_dataset_converter(schema)
+        parsed_evaluation_datasets.append(converter.convert(raw_ds_entry))
+
+    return merge_evaluation_datasets(parsed_evaluation_datasets, agent_info)
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 901556db8a..671471be81 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -879,6 +879,9 @@ def _eval_case_to_agent_data(
         eval_case: types.EvalCase,
     ) -> Optional[types.evals.AgentData]:
         """Converts an EvalCase object to an AgentData object."""
+        if getattr(eval_case, "agent_data", None):
+            return eval_case.agent_data
+
         if not eval_case.agent_info and not eval_case.intermediate_events:
             return None
         tools = None
@@ -920,11 +923,16 @@ def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if not eval_case.responses or response_index >= len(eval_case.responses):
+        if (
+            not eval_case.responses or response_index >= len(eval_case.responses)
+        ) and not getattr(eval_case, "agent_data", None):
             raise IndexError(f"response_index {response_index} is out of bounds.")
 
-        response_content = eval_case.responses[response_index].response
-        if not response_content:
+        response_content = None
+        if eval_case.responses and response_index < len(eval_case.responses):
+            response_content = eval_case.responses[response_index].response
+
+        if not response_content and not getattr(eval_case, "agent_data", None):
             raise ValueError(
                 f"Response content missing for candidate {response_index}."
             )
@@ -1442,9 +1450,15 @@ def compute_metrics_and_aggregate(
                 for eval_case_index, eval_case in enumerate(
                     evaluation_run_config.dataset.eval_cases
                 ):
+                    num_responses = (
+                        len(eval_case.responses) if eval_case.responses else 0
+                    )
+                    if num_responses == 0 and getattr(eval_case, "agent_data", None):
+                        num_responses = 1
+
                     actual_num_candidates_for_case = min(
                         evaluation_run_config.num_response_candidates,
-                        len(eval_case.responses),
+                        num_responses,
                     )
                     for response_index in range(actual_num_candidates_for_case):
                         try:
diff --git a/vertexai/_genai/_evals_metric_loaders.py b/vertexai/_genai/_evals_metric_loaders.py
index cbe14b160c..a3482eef6e 100644
--- a/vertexai/_genai/_evals_metric_loaders.py
+++ b/vertexai/_genai/_evals_metric_loaders.py
@@ -299,6 +299,18 @@ def MULTI_TURN_GENERAL_QUALITY(self) -> LazyLoadedPrebuiltMetric:
     def MULTI_TURN_TEXT_QUALITY(self) -> LazyLoadedPrebuiltMetric:
         return self.__getattr__("MULTI_TURN_TEXT_QUALITY")
 
+    @property  # Returns the metric resolved by name via __getattr__, at version "v1".
+    def MULTI_TURN_TOOL_USE_QUALITY(self) -> LazyLoadedPrebuiltMetric:
+        return self.__getattr__("MULTI_TURN_TOOL_USE_QUALITY", version="v1")
+
+    @property  # Returns the metric resolved by name via __getattr__, at version "v1".
+    def MULTI_TURN_TRAJECTORY_QUALITY(self) -> LazyLoadedPrebuiltMetric:
+        return self.__getattr__("MULTI_TURN_TRAJECTORY_QUALITY", version="v1")
+
+    @property  # Returns the metric resolved by name via __getattr__, at version "v1".
+    def MULTI_TURN_TASK_SUCCESS(self) -> LazyLoadedPrebuiltMetric:
+        return self.__getattr__("MULTI_TURN_TASK_SUCCESS", version="v1")
+
     @property
     def FINAL_RESPONSE_MATCH(self) -> LazyLoadedPrebuiltMetric:
         return self.__getattr__("FINAL_RESPONSE_MATCH", version="v2")
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index dba63496fb..f3d29c407d 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -224,7 +224,11 @@ def _EvaluateInstancesRequestParameters_to_vertex(
         )
 
     if getv(from_object, ["instance"]) is not None:
-        setv(to_object, ["instance"], getv(from_object, ["instance"]))
+        setv(
+            to_object,
+            ["instance"],
+            _EvaluationInstance_to_vertex(getv(from_object, ["instance"]), to_object),
+        )
 
     if getv(from_object, ["config"]) is not None:
         setv(to_object, ["config"], getv(from_object, ["config"]))
@@ -232,6 +236,32 @@ def _EvaluateInstancesRequestParameters_to_vertex(
     return to_object
 
 
+def _EvaluationInstance_to_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    """Converts an evaluation instance into its Vertex request payload.
+
+    Fields that are None are omitted; `agent_data` is sent as `agent_eval_data`.
+    `parent_object` is unused here; it mirrors the sibling converters' signature.
+    """
+    # (source field, destination key) pairs, kept in the emission order.
+    field_mapping = (
+        ("prompt", "prompt"),
+        ("response", "response"),
+        ("reference", "reference"),
+        ("other_data", "otherData"),
+        ("agent_data", "agent_eval_data"),
+        ("rubric_groups", "rubricGroups"),
+    )
+    to_object: dict[str, Any] = {}
+    for source_field, target_key in field_mapping:
+        value = getv(from_object, [source_field])
+        if value is not None:
+            setv(to_object, [target_key], value)
+    return to_object
+
+
 def _EvaluationRunConfig_from_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
diff --git a/vertexai/_genai/types/evals.py b/vertexai/_genai/types/evals.py
index 970286cd1d..dd7230da8a 100644
--- a/vertexai/_genai/types/evals.py
+++ b/vertexai/_genai/types/evals.py
@@ -122,6 +122,10 @@ class AgentConfig(_common.BaseModel):
       This ID is used to refer to this agent, e.g., in AgentEvent.author, or in
       the `sub_agents` field. It must be unique within the `agents` map.""",
     )
+    agent_resource_name: Optional[str] = Field(
+        default=None,
+        description="""The Agent Engine resource name, formatted as `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""",
+    )
     agent_type: Optional[str] = Field(
         default=None,
         description="""The type or class of the agent (e.g., "LlmAgent", "RouterAgent",
@@ -160,6 +164,51 @@ class AgentConfig(_common.BaseModel):
         description="""A field containing instructions from the developer for the agent.""",
     )
 
+    @staticmethod
+    def _get_tool_declarations_from_agent(agent: Any) -> genai_types.ToolListUnion:
+        """Gets tool declarations from an agent.
+
+        Args:
+          agent: The agent to read tools from. Expected to be a google.adk.agents.LlmAgent; typed as Any to avoid a dependency on ADK.
+
+        Returns:
+          One single-function tool entry per agent tool; an empty list if `tools` is missing or None.
+        """
+        tool_declarations: genai_types.ToolListUnion = []
+        for tool in getattr(agent, "tools", None) or []:
+            tool_declarations.append(
+                {
+                    "function_declarations": [
+                        genai_types.FunctionDeclaration.from_callable_with_api_option(
+                            callable=tool
+                        )
+                    ]
+                }
+            )
+        return tool_declarations
+
+    @classmethod
+    def from_agent(
+        cls, agent: Any, agent_resource_name: Optional[str] = None
+    ) -> "AgentConfig":
+        """Builds an AgentConfig that describes the given ADK agent.
+
+        Args:
+          agent: The source agent, expected to be a google.adk.agents.LlmAgent; typed as Any to avoid a dependency on ADK.
+          agent_resource_name: Optional. The agent engine resource name.
+
+        Returns:
+            An AgentConfig whose id falls back to "agent_0" when the agent has no usable name.
+        """
+        return cls(  # pytype: disable=missing-parameter
+            agent_resource_name=agent_resource_name,
+            agent_type=agent.__class__.__name__,
+            agent_id=getattr(agent, "name", None) or "agent_0",
+            instruction=getattr(agent, "instruction", None),
+            description=getattr(agent, "description", None),
+            tools=AgentConfig._get_tool_declarations_from_agent(agent),
+        )
+
 
 class AgentConfigDict(TypedDict, total=False):
     """Represents configuration for an Agent."""
@@ -169,6 +218,9 @@ class AgentConfigDict(TypedDict, total=False):
       This ID is used to refer to this agent, e.g., in AgentEvent.author, or in
       the `sub_agents` field. It must be unique within the `agents` map."""
 
+    agent_resource_name: Optional[str]
+    """The Agent Engine resource name, formatted as `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`."""
+
     agent_type: Optional[str]
     """The type or class of the agent (e.g., "LlmAgent", "RouterAgent",
       "ToolUseAgent"). Useful for the autorater to understand the expected
@@ -334,6 +386,98 @@ class AgentData(_common.BaseModel):
     )
     events: Optional[Events] = Field(default=None, description="""A list of events.""")
 
+    @classmethod
+    def from_session(cls, agent: Any, session_history: list[Any]) -> "AgentData":
+        """Creates an AgentData object from a session history.
+
+        Segments the flat list of session events into ConversationTurns. A new turn
+        is initiated by a User message: an event whose own role, or whose content's
+        role, is "user". Dict events and object events are classified the same way.
+
+        Args:
+            agent: The agent instance used in the session.
+            session_history: A list of raw events/messages from the session.
+
+        Returns:
+            An AgentData object containing the segmented history and agent config.
+
+        Raises:
+            ValueError: If a dict event or dict content fails Content validation.
+        """
+        agent_config = AgentConfig.from_agent(agent)
+        agent_id = agent_config.agent_id or "agent_0"
+        agents_map = {agent_id: agent_config}
+
+        def _read(obj: Any, name: str) -> Any:
+            # Uniform field access for dict keys and object attributes.
+            if isinstance(obj, dict):
+                return obj.get(name)
+            return getattr(obj, name, None)
+
+        def _validate(payload: dict[str, Any], failure: str) -> Any:
+            # Wraps any Content validation failure in a ValueError with context.
+            try:
+                return genai_types.Content.model_validate(payload)
+            except Exception as e:
+                raise ValueError(f"{failure}: {payload}") from e
+
+        turns: list[ConversationTurn] = []
+        current_turn_events: list[AgentEvent] = []
+
+        def _close_turn() -> None:
+            # Seals the pending events into the next ConversationTurn.
+            turns.append(
+                ConversationTurn(  # pytype: disable=missing-parameter
+                    turn_index=len(turns),
+                    turn_id=f"turn_{len(turns)}",
+                    events=list(current_turn_events),
+                )
+            )
+            current_turn_events.clear()
+
+        for event in session_history:
+            raw_content = _read(event, "content")
+            # The role may sit on the event itself or on its nested content.
+            is_user = (
+                _read(event, "role") == "user"
+                or _read(raw_content, "role") == "user"
+            )
+            if is_user and current_turn_events:
+                _close_turn()
+
+            content = None
+            if isinstance(raw_content, genai_types.Content):
+                content = raw_content
+            elif isinstance(event, genai_types.Content):
+                # A bare Content is the object counterpart of a dict with "parts".
+                content = event
+            elif isinstance(event, dict):
+                if isinstance(raw_content, dict):
+                    content = _validate(
+                        raw_content,
+                        "Failed to validate Content from dictionary in session history",
+                    )
+                elif isinstance(raw_content, str):
+                    content = genai_types.Content(
+                        parts=[genai_types.Part(text=raw_content)]
+                    )
+                elif "content" not in event and "parts" in event:
+                    content = _validate(
+                        event, "Failed to validate Content from event with 'parts'"
+                    )
+
+            current_turn_events.append(
+                AgentEvent(  # pytype: disable=missing-parameter
+                    author="user" if is_user else agent_id,
+                    content=content,
+                )
+            )
+
+        if current_turn_events:
+            _close_turn()
+
+        return cls(agents=agents_map, turns=turns)  # pytype: disable=missing-parameter
+
 
 class AgentDataDict(TypedDict, total=False):
     """Represents data specific to multi-turn agent evaluations."""