From 2ea7c0cc7e679e89b97472efe3b05ad4e872a10e Mon Sep 17 00:00:00 2001 From: Jason Dai Date: Fri, 13 Feb 2026 17:03:45 -0800 Subject: [PATCH] chore: GenAI Client(evals) - Add class methods for parsing raw Agent Session history into the new `AgentData` structure. Add `agent_resource_name` attribute to `AgentConfig` and loading methods. feat: GenAI Client(evals) - Add 3 new multi-turn predefined metrics for agent evaluation (`MULTI_TURN_TOOL_USE_QUALITY`, `MULTI_TURN_TRAJECTORY_QUALITY`, `MULTI_TURN_TASK_SUCCESS`). chore: GenAI Client(evals) - Update evaluation data converters and metric handlers to natively support `AgentData` in `EvaluationDataset` and `EvalCase`. chore: GenAI Client(evals) - Map `agent_data` to `agent_eval_data` in Vertex REST payload generation. PiperOrigin-RevId: 869945268 --- .../vertexai/genai/replays/test_evaluate.py | 257 ++++++++++++++++++ .../genai/replays/test_evaluate_instances.py | 33 --- vertexai/_genai/_evals_common.py | 24 +- vertexai/_genai/_evals_constant.py | 3 + vertexai/_genai/_evals_data_converters.py | 136 ++++++--- vertexai/_genai/_evals_metric_handlers.py | 22 +- vertexai/_genai/_evals_metric_loaders.py | 12 + vertexai/_genai/evals.py | 32 ++- vertexai/_genai/types/evals.py | 144 ++++++++++ 9 files changed, 577 insertions(+), 86 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_evaluate.py b/tests/unit/vertexai/genai/replays/test_evaluate.py index e76d664d4c..931bea8cb9 100644 --- a/tests/unit/vertexai/genai/replays/test_evaluate.py +++ b/tests/unit/vertexai/genai/replays/test_evaluate.py @@ -16,6 +16,7 @@ from tests.unit.vertexai.genai.replays import pytest_helper from vertexai._genai import types +from google.genai import types as genai_types import pandas as pd @@ -96,6 +97,262 @@ def test_evaluation_byor(client): assert case_result.response_candidate_results is not None +def test_evaluation_agent_data(client): + """Tests evaluate method with AgentData.""" + client._api_client._http_options.base_url = ( + "https://autopush-aiplatform.sandbox.googleapis.com/" + ) + client._api_client._http_options.api_version = "v1beta1" + + agent_data = types.evals.AgentData( + agents={ + "coordinator": types.evals.AgentConfig( + agent_id="coordinator", + agent_type="RouterAgent", + description="Root agent that delegates to specialists.", + instruction=( + "You are a travel coordinator. Delegate flight tasks to" + " 'flight_bot' and hotel tasks to 'hotel_bot'." + ), + sub_agents=["flight_bot", "hotel_bot"], + tools=[ + genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="delegate_to_agent", + description=("Delegates conversation to a sub-agent."), + ) + ] + ) + ], + ), + "flight_bot": types.evals.AgentConfig( + agent_id="flight_bot", + agent_type="SpecialistAgent", + description="Handles flight searches.", + instruction="Search for flights using the available tools.", + tools=[ + genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="search_flights", + description=( + "Finds flights based on origin and" " destination." + ), + ) + ] + ) + ], + ), + "hotel_bot": types.evals.AgentConfig( + agent_id="hotel_bot", + agent_type="SpecialistAgent", + description="Handles hotel searches.", + instruction="Search for hotels using the available tools.", + tools=[ + genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="search_hotels", + description="Finds hotels in a given location.", + ) + ] + ) + ], + ), + }, + turns=[ + types.evals.ConversationTurn( + turn_index=0, + events=[ + types.evals.AgentEvent( + author="user", + content=genai_types.Content( + role="user", + parts=[ + genai_types.Part( + text=( + "I need to book a flight to NYC for next" + " Monday." + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="coordinator", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + function_call=genai_types.FunctionCall( + name="delegate_to_agent", + args={"agent_name": "flight_bot"}, + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="flight_bot", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + function_call=genai_types.FunctionCall( + name="search_flights", + args={ + "destination": "NYC", + "date": "next Monday", + }, + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="flight_bot", + content=genai_types.Content( + role="tool", + parts=[ + genai_types.Part( + function_response=genai_types.FunctionResponse( + name="search_flights", + response={ + "flights": [ + { + "id": "UA100", + "price": "$300", + } + ] + }, + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="flight_bot", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + text="I found flight UA100 to NYC for $300." + ) + ], + ), + ), + ], + ), + types.evals.ConversationTurn( + turn_index=1, + events=[ + types.evals.AgentEvent( + author="user", + content=genai_types.Content( + role="user", + parts=[ + genai_types.Part( + text=( + "Great, book that. I also need a hotel" + " there." + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="coordinator", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + function_call=genai_types.FunctionCall( + name="delegate_to_agent", + args={"agent_name": "hotel_bot"}, + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="hotel_bot", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + function_call=genai_types.FunctionCall( + name="search_hotels", + args={"location": "NYC"}, + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="hotel_bot", + content=genai_types.Content( + role="tool", + parts=[ + genai_types.Part( + function_response=genai_types.FunctionResponse( + name="search_hotels", + response={ + "hotels": [ + { + "name": "Central Park Hotel", + "rating": 4.5, + } + ] + }, + ) + ) + ], + ), + ), + types.evals.AgentEvent( + author="hotel_bot", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + text="I recommend the Central Park Hotel." + ) + ], + ), + ), + ], + ), + ], + ) + + # Create the EvalCase and wrap it in an EvaluationDataset + eval_case = types.EvalCase(agent_data=agent_data) + eval_dataset = types.EvaluationDataset(eval_cases=[eval_case]) + + metrics = [ + types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY, + ] + + evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics) + + assert isinstance(evaluation_result, types.EvaluationResult) + + assert evaluation_result.summary_metrics is not None + assert len(evaluation_result.summary_metrics) > 0 + for summary in evaluation_result.summary_metrics: + assert isinstance(summary, types.AggregatedMetricResult) + assert summary.metric_name is not None + assert summary.mean_score is not None + + assert evaluation_result.eval_case_results is not None + assert len(evaluation_result.eval_case_results) > 0 + for case_result in evaluation_result.eval_case_results: + assert isinstance(case_result, types.EvalCaseResult) + assert case_result.eval_case_index is not None + assert case_result.response_candidate_results is not None + + pytestmark = pytest_helper.setup( file=__file__, globals_for_file=globals(), diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py index 3697189f59..2ebabb8641 100644 --- a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py +++ b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py @@ -105,39 +105,6 @@ def test_pointwise_metric(client): assert response.pointwise_metric_result.score is not None -# def test_predefined_metric_with_agent_data(client): -# """Tests the _evaluate_instances method with predefined metric and agent_data.""" -# agent_data = types.evals.AgentData( -# agent_config=types.evals.AgentConfig( -# tools=[ -# genai_types.Tool( -# function_declarations=[ -# genai_types.FunctionDeclaration(name="search") -# ] -# ) -# ], -# developer_instruction=types.evals.InstanceData(text="instruction"), -# ), -# events=types.evals.Events( -# event=[genai_types.Content(parts=[genai_types.Part(text="hello")])] -# ), -# ) -# instance = types.EvaluationInstance( -# prompt=types.evals.InstanceData(text="What is the capital of France?"), -# response=types.evals.InstanceData(text="Paris"), -# reference=types.evals.InstanceData(text="Paris"), -# agent_data=agent_data, -# ) - -# response = client.evals.evaluate_instances( -# metric_config=types._EvaluateInstancesRequestParameters( -# metrics=[types.Metric(name="general_quality_v1")], -# instance=instance, -# ) -# ) -# assert response.metric_results[0].score is not None - - def test_pairwise_metric_with_autorater(client): """Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig.""" diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 0bc28994ed..f77bca328d 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -1047,8 +1047,10 @@ def _resolve_dataset_inputs( datasets_to_process = dataset logger.info("Processing %s dataset(s).", num_response_candidates) - loaded_raw_datasets: list[list[dict[str, Any]]] = [] - schemas_for_merge: list[str] = [] + if len(datasets_to_process) == 1 and datasets_to_process[0].eval_cases: + return datasets_to_process[0], 1 + + parsed_evaluation_datasets: list[types.EvaluationDataset] = [] for i, ds_item in enumerate(datasets_to_process): if not isinstance(ds_item, types.EvaluationDataset): @@ -1062,9 +1064,13 @@ def _resolve_dataset_inputs( f"Item at index {i} is not an EvaluationDataset: {type(ds_item)}" ) + if ds_item.eval_cases: + logger.info("Dataset %d already contains eval_cases.", i) + parsed_evaluation_datasets.append(ds_item) + continue + ds_source_for_loader = _get_dataset_source(ds_item) current_loaded_data = loader.load(ds_source_for_loader) - loaded_raw_datasets.append(current_loaded_data) if dataset_schema: current_schema = _evals_data_converters.EvalDatasetSchema(dataset_schema) @@ -1072,7 +1078,6 @@ def _resolve_dataset_inputs( current_schema = _evals_data_converters.auto_detect_dataset_schema( # type: ignore[assignment] current_loaded_data ) - schemas_for_merge.append(current_schema) logger.info( "Dataset %d: Schema: %s. Using %s converter.", @@ -1082,13 +1087,12 @@ def _resolve_dataset_inputs( current_schema ).__class__.__name__, ) + converter = _evals_data_converters.get_dataset_converter(current_schema) + parsed_evaluation_datasets.append(converter.convert(current_loaded_data)) - processed_eval_dataset = ( - _evals_data_converters.merge_response_datasets_into_canonical_format( - raw_datasets=loaded_raw_datasets, - schemas=schemas_for_merge, - agent_info=agent_info, - ) + processed_eval_dataset = _evals_data_converters.merge_evaluation_datasets( + datasets=parsed_evaluation_datasets, + agent_info=agent_info, ) if not processed_eval_dataset.eval_cases: diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py index 6fc27d94e0..fc3f3e2771 100644 --- a/vertexai/_genai/_evals_constant.py +++ b/vertexai/_genai/_evals_constant.py @@ -23,6 +23,9 @@ "safety_v1", "multi_turn_general_quality_v1", "multi_turn_text_quality_v1", + "multi_turn_tool_use_quality_v1", + "multi_turn_trajectory_quality_v1", + "multi_turn_task_success_v1", "final_response_match_v2", "final_response_reference_free_v1", "final_response_quality_v1", diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py index 459600caff..876f9a7341 100644 --- a/vertexai/_genai/_evals_data_converters.py +++ b/vertexai/_genai/_evals_data_converters.py @@ -202,24 +202,25 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset: system_instruction_data = item.pop("instruction", None) rubric_groups_data = item.pop("rubric_groups", None) intermediate_events_data = item.pop("intermediate_events", None) + agent_data_raw = item.pop("agent_data", None) - if not response_data: + if not response_data and not agent_data_raw: raise ValueError( "Response is required but missing for %s." % eval_case_id ) - if not prompt_data: + if not prompt_data and not agent_data_raw: raise ValueError( "Prompt is required but missing for %s." % eval_case_id ) - prompt: genai_types.Content + prompt: Optional[genai_types.Content] = None if isinstance(prompt_data, str): prompt = genai_types.Content(parts=[genai_types.Part(text=prompt_data)]) elif isinstance(prompt_data, dict): prompt = genai_types.Content.model_validate(prompt_data) elif isinstance(prompt_data, genai_types.Content): prompt = prompt_data - else: + elif not agent_data_raw: raise ValueError( "Invalid prompt type for case %s: %s" % (i, type(prompt_data)) ) @@ -265,7 +266,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset: type(content), ) - responses: list[types.ResponseCandidate] + responses: Optional[list[types.ResponseCandidate]] = None if isinstance(response_data, dict): responses = [ types.ResponseCandidate( @@ -282,7 +283,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset: ] elif isinstance(response_data, genai_types.Content): responses = [types.ResponseCandidate(response=response_data)] - else: + elif not agent_data_raw: raise ValueError( "Invalid response type for case %s: %s" % (i, type(response_data)) ) @@ -411,6 +412,41 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset: i, ) + agent_data: Optional[types.evals.AgentData] = None + if agent_data_raw: + if isinstance(agent_data_raw, str): + try: + agent_data_dict = json.loads(agent_data_raw) + agent_data = types.evals.AgentData.model_validate( + agent_data_dict + ) + except json.JSONDecodeError: + logger.warning( + "Could not decode agent_data JSON string for case %s.", i + ) + except ValidationError as e: + logger.warning( + "Failed to validate agent_data for case %s: %s", i, e + ) + elif isinstance(agent_data_raw, dict): + try: + agent_data = types.evals.AgentData.model_validate( + agent_data_raw + ) + except ValidationError as e: + logger.warning( + "Failed to validate agent_data for case %s: %s", i, e + ) + elif isinstance(agent_data_raw, types.evals.AgentData): + agent_data = agent_data_raw + else: + logger.warning( + "Invalid type for agent_data in case %s. Expected str, dict" + " or types.evals.AgentData object. Got %s", + i, + type(agent_data_raw), + ) + eval_case = types.EvalCase( eval_case_id=eval_case_id, prompt=prompt, @@ -420,6 +456,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset: system_instruction=system_instruction, rubric_groups=rubric_groups, intermediate_events=intermediate_events, + agent_data=agent_data, **item, # Pass remaining columns as extra fields to EvalCase. # They can be used for custom metric prompt templates. ) @@ -605,6 +642,9 @@ def auto_detect_dataset_schema( if "role" in messages_list[0] and "content" in messages_list[0]: return EvalDatasetSchema.OPENAI + if "agent_data" in keys: + return EvalDatasetSchema.FLATTEN + if {"prompt", "response"}.issubset(keys) or { "response", "reference", @@ -701,51 +741,40 @@ def _validate_case_consistency( ) -def merge_response_datasets_into_canonical_format( - raw_datasets: list[list[dict[str, Any]]], - schemas: list[str], +def merge_evaluation_datasets( + datasets: list[types.EvaluationDataset], agent_info: Optional[types.evals.AgentInfo] = None, ) -> types.EvaluationDataset: - """Merges multiple raw response datasets into a single EvaluationDataset. + """Merges multiple EvaluationDatasets into a single EvaluationDataset. - Assumes that each dataset in raw_datasets has responses corresponding - to the same set of prompts, in the same order. The prompt, reference, - system_instruction, and conversation_history are taken from the first dataset. + Assumes that each dataset has responses corresponding to the same set of + prompts, in the same order. The prompt, reference, system_instruction, and + conversation_history are taken from the first dataset. """ - if not isinstance(raw_datasets, list): - raise TypeError( - "Input 'raw_datasets' must be a list, got %s." % type(raw_datasets) - ) - if not raw_datasets or not all(isinstance(ds, list) for ds in raw_datasets): - raise ValueError( - "Input 'raw_datasets' cannot be empty and must be a list of lists." - ) - if not schemas or len(schemas) != len(raw_datasets): - raise ValueError( - "A list of schemas must be provided, one for each raw dataset. " - "Got %s schemas for %s datasets." % (len(schemas), len(raw_datasets)) - ) + if not datasets: + raise ValueError("Input 'datasets' cannot be empty.") + + num_expected_cases = 0 + if datasets[0].eval_cases: + num_expected_cases = len(datasets[0].eval_cases) - num_expected_cases = len(raw_datasets[0]) if num_expected_cases == 0: logger.warning( "The first dataset has no evaluation cases. Result will be empty." ) return types.EvaluationDataset(eval_cases=[]) - parsed_evaluation_datasets: list[types.EvaluationDataset] = [] - for i, (raw_ds_entry, schema) in enumerate(zip(raw_datasets, schemas)): - if len(raw_ds_entry) != num_expected_cases: + for i, ds in enumerate(datasets): + current_len = len(ds.eval_cases) if ds.eval_cases else 0 + if current_len != num_expected_cases: raise ValueError( "All datasets must have the same number of evaluation cases. " - "Base dataset (0) has %s, but dataset %s (schema: %s) has %s." - % (num_expected_cases, i, schema, len(raw_ds_entry)) + "Base dataset (0) has %s, but dataset %s has %s." + % (num_expected_cases, i, current_len) ) - converter = get_dataset_converter(schema) - parsed_evaluation_datasets.append(converter.convert(raw_ds_entry)) merged_eval_cases: list[types.EvalCase] = [] - base_parsed_dataset = parsed_evaluation_datasets[0] + base_parsed_dataset = datasets[0] for case_idx in range(num_expected_cases): base_eval_case: types.EvalCase = ( @@ -781,9 +810,7 @@ def merge_response_datasets_into_canonical_format( }, exclude_none=True, ) - for dataset_idx_offset, current_parsed_ds in enumerate( - parsed_evaluation_datasets[1:], start=1 - ): + for dataset_idx_offset, current_parsed_ds in enumerate(datasets[1:], start=1): current_ds_eval_case: types.EvalCase = ( current_parsed_ds.eval_cases[case_idx] if current_parsed_ds.eval_cases @@ -838,3 +865,36 @@ def merge_response_datasets_into_canonical_format( merged_eval_cases.append(merged_case) return types.EvaluationDataset(eval_cases=merged_eval_cases) + + +def merge_response_datasets_into_canonical_format( + raw_datasets: list[list[dict[str, Any]]], + schemas: list[str], + agent_info: Optional[types.evals.AgentInfo] = None, +) -> types.EvaluationDataset: + """Merges multiple raw response datasets into a single EvaluationDataset. + + Assumes that each dataset in raw_datasets has responses corresponding + to the same set of prompts, in the same order. The prompt, reference, + system_instruction, and conversation_history are taken from the first dataset. + """ + if not isinstance(raw_datasets, list): + raise TypeError( + "Input 'raw_datasets' must be a list, got %s." % type(raw_datasets) + ) + if not raw_datasets or not all(isinstance(ds, list) for ds in raw_datasets): + raise ValueError( + "Input 'raw_datasets' cannot be empty and must be a list of lists." + ) + if not schemas or len(schemas) != len(raw_datasets): + raise ValueError( + "A list of schemas must be provided, one for each raw dataset. " + "Got %s schemas for %s datasets." % (len(schemas), len(raw_datasets)) + ) + + parsed_evaluation_datasets: list[types.EvaluationDataset] = [] + for i, (raw_ds_entry, schema) in enumerate(zip(raw_datasets, schemas)): + converter = get_dataset_converter(schema) + parsed_evaluation_datasets.append(converter.convert(raw_ds_entry)) + + return merge_evaluation_datasets(parsed_evaluation_datasets, agent_info) diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py index 901556db8a..671471be81 100644 --- a/vertexai/_genai/_evals_metric_handlers.py +++ b/vertexai/_genai/_evals_metric_handlers.py @@ -879,6 +879,9 @@ def _eval_case_to_agent_data( eval_case: types.EvalCase, ) -> Optional[types.evals.AgentData]: """Converts an EvalCase object to an AgentData object.""" + if getattr(eval_case, "agent_data", None): + return eval_case.agent_data + if not eval_case.agent_info and not eval_case.intermediate_events: return None tools = None @@ -920,11 +923,16 @@ def _build_request_payload( self, eval_case: types.EvalCase, response_index: int ) -> dict[str, Any]: """Builds the request parameters for evaluate instances request.""" - if not eval_case.responses or response_index >= len(eval_case.responses): + if ( + not eval_case.responses or response_index >= len(eval_case.responses) + ) and not getattr(eval_case, "agent_data", None): raise IndexError(f"response_index {response_index} is out of bounds.") - response_content = eval_case.responses[response_index].response - if not response_content: + response_content = None + if eval_case.responses and response_index < len(eval_case.responses): + response_content = eval_case.responses[response_index].response + + if not response_content and not getattr(eval_case, "agent_data", None): raise ValueError( f"Response content missing for candidate {response_index}." ) @@ -1442,9 +1450,15 @@ def compute_metrics_and_aggregate( for eval_case_index, eval_case in enumerate( evaluation_run_config.dataset.eval_cases ): + num_responses = ( + len(eval_case.responses) if eval_case.responses else 0 + ) + if num_responses == 0 and getattr(eval_case, "agent_data", None): + num_responses = 1 + actual_num_candidates_for_case = min( evaluation_run_config.num_response_candidates, - len(eval_case.responses), + num_responses, ) for response_index in range(actual_num_candidates_for_case): try: diff --git a/vertexai/_genai/_evals_metric_loaders.py b/vertexai/_genai/_evals_metric_loaders.py index cbe14b160c..a3482eef6e 100644 --- a/vertexai/_genai/_evals_metric_loaders.py +++ b/vertexai/_genai/_evals_metric_loaders.py @@ -299,6 +299,18 @@ def MULTI_TURN_GENERAL_QUALITY(self) -> LazyLoadedPrebuiltMetric: def MULTI_TURN_TEXT_QUALITY(self) -> LazyLoadedPrebuiltMetric: return self.__getattr__("MULTI_TURN_TEXT_QUALITY") + @property + def MULTI_TURN_TOOL_USE_QUALITY(self) -> LazyLoadedPrebuiltMetric: + return self.__getattr__("MULTI_TURN_TOOL_USE_QUALITY", version="v1") + + @property + def MULTI_TURN_TRAJECTORY_QUALITY(self) -> LazyLoadedPrebuiltMetric: + return self.__getattr__("MULTI_TURN_TRAJECTORY_QUALITY", version="v1") + + @property + def MULTI_TURN_TASK_SUCCESS(self) -> LazyLoadedPrebuiltMetric: + return self.__getattr__("MULTI_TURN_TASK_SUCCESS", version="v1") + @property def FINAL_RESPONSE_MATCH(self) -> LazyLoadedPrebuiltMetric: return self.__getattr__("FINAL_RESPONSE_MATCH", version="v2") diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index dba63496fb..f3d29c407d 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -224,7 +224,11 @@ def _EvaluateInstancesRequestParameters_to_vertex( ) if getv(from_object, ["instance"]) is not None: - setv(to_object, ["instance"], getv(from_object, ["instance"])) + setv( + to_object, + ["instance"], + _EvaluationInstance_to_vertex(getv(from_object, ["instance"]), to_object), + ) if getv(from_object, ["config"]) is not None: setv(to_object, ["config"], getv(from_object, ["config"])) @@ -232,6 +236,32 @@ def _EvaluateInstancesRequestParameters_to_vertex( return to_object +def _EvaluationInstance_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["prompt"]) is not None: + setv(to_object, ["prompt"], getv(from_object, ["prompt"])) + + if getv(from_object, ["response"]) is not None: + setv(to_object, ["response"], getv(from_object, ["response"])) + + if getv(from_object, ["reference"]) is not None: + setv(to_object, ["reference"], getv(from_object, ["reference"])) + + if getv(from_object, ["other_data"]) is not None: + setv(to_object, ["otherData"], getv(from_object, ["other_data"])) + + if getv(from_object, ["agent_data"]) is not None: + setv(to_object, ["agent_eval_data"], getv(from_object, ["agent_data"])) + + if getv(from_object, ["rubric_groups"]) is not None: + setv(to_object, ["rubricGroups"], getv(from_object, ["rubric_groups"])) + + return to_object + + def _EvaluationRunConfig_from_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, diff --git a/vertexai/_genai/types/evals.py b/vertexai/_genai/types/evals.py index 970286cd1d..dd7230da8a 100644 --- a/vertexai/_genai/types/evals.py +++ b/vertexai/_genai/types/evals.py @@ -122,6 +122,10 @@ class AgentConfig(_common.BaseModel): This ID is used to refer to this agent, e.g., in AgentEvent.author, or in the `sub_agents` field. It must be unique within the `agents` map.""", ) + agent_resource_name: Optional[str] = Field( + default=None, + description="""The Agent Engine resource name, formatted as `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""", + ) agent_type: Optional[str] = Field( default=None, description="""The type or class of the agent (e.g., "LlmAgent", "RouterAgent", @@ -160,6 +164,51 @@ class AgentConfig(_common.BaseModel): description="""A field containing instructions from the developer for the agent.""", ) + @staticmethod + def _get_tool_declarations_from_agent(agent: Any) -> genai_types.ToolListUnion: + """Gets tool declarations from an agent. + + Args: + agent: The agent to get the tool declarations from. Data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK. + + Returns: + The tool declarations of the agent. + """ + tool_declarations: genai_types.ToolListUnion = [] + for tool in agent.tools: + tool_declarations.append( + { + "function_declarations": [ + genai_types.FunctionDeclaration.from_callable_with_api_option( + callable=tool + ) + ] + } + ) + return tool_declarations + + @classmethod + def from_agent( + cls, agent: Any, agent_resource_name: Optional[str] = None + ) -> "AgentConfig": + """Creates an AgentConfig from an ADK agent object. + + Args: + agent: The agent to get the agent info from, data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK. + agent_resource_name: Optional. The agent engine resource name. + + Returns: + An AgentConfig object populated with the agent's metadata. + """ + return cls( # pytype: disable=missing-parameter + agent_id=getattr(agent, "name", "agent_0") or "agent_0", + agent_resource_name=agent_resource_name, + agent_type=agent.__class__.__name__, + description=getattr(agent, "description", None), + instruction=getattr(agent, "instruction", None), + tools=AgentConfig._get_tool_declarations_from_agent(agent), + ) + class AgentConfigDict(TypedDict, total=False): """Represents configuration for an Agent.""" @@ -169,6 +218,9 @@ class AgentConfigDict(TypedDict, total=False): This ID is used to refer to this agent, e.g., in AgentEvent.author, or in the `sub_agents` field. It must be unique within the `agents` map.""" + agent_resource_name: Optional[str] + """The Agent Engine resource name, formatted as `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""" + agent_type: Optional[str] """The type or class of the agent (e.g., "LlmAgent", "RouterAgent", "ToolUseAgent"). Useful for the autorater to understand the expected @@ -334,6 +386,98 @@ class AgentData(_common.BaseModel): ) events: Optional[Events] = Field(default=None, description="""A list of events.""") + @classmethod + def from_session(cls, agent: Any, session_history: list[Any]) -> "AgentData": + """Creates an AgentData object from a session history. + + Segments the flat list of session events into ConversationTurns. A new turn + is initiated by a User message. + + Args: + agent: The agent instance used in the session. + session_history: A list of raw events/messages from the session. + + Returns: + An AgentData object containing the segmented history and agent config. + """ + agent_config = AgentConfig.from_agent(agent) + agent_id = agent_config.agent_id or "agent_0" + agents_map = {agent_id: agent_config} + + turns: list[ConversationTurn] = [] + current_turn_events: list[AgentEvent] = [] + + for event in session_history: + is_user = False + if isinstance(event, dict): + if event.get("role") == "user": + is_user = True + elif ( + isinstance(event.get("content"), dict) + and event["content"].get("role") == "user" + ): + is_user = True + elif hasattr(event, "role") and event.role == "user": + is_user = True + + if is_user and current_turn_events: + turns.append( + ConversationTurn( # pytype: disable=missing-parameter + turn_index=len(turns), + turn_id=f"turn_{len(turns)}", + events=current_turn_events, + ) + ) + current_turn_events = [] + + author = "user" if is_user else agent_id + + content = None + if isinstance(event, dict): + if "content" in event: + raw_content = event["content"] + if isinstance(raw_content, genai_types.Content): + content = raw_content + elif isinstance(raw_content, dict): + try: + content = genai_types.Content.model_validate(raw_content) + except Exception as e: + raise ValueError( + f"Failed to validate Content from dictionary in session history: {raw_content}" + ) from e + elif isinstance(raw_content, str): + content = genai_types.Content( + parts=[genai_types.Part(text=raw_content)] + ) + elif "parts" in event: + try: + content = genai_types.Content.model_validate(event) + except Exception as e: + raise ValueError( + f"Failed to validate Content from event with 'parts': {event}" + ) from e + elif hasattr(event, "content") and isinstance( + event.content, genai_types.Content + ): + content = event.content + + agent_event = AgentEvent( # pytype: disable=missing-parameter + author=author, + content=content, + ) + current_turn_events.append(agent_event) + + if current_turn_events: + turns.append( + ConversationTurn( # pytype: disable=missing-parameter + turn_index=len(turns), + turn_id=f"turn_{len(turns)}", + events=current_turn_events, + ) + ) + + return cls(agents=agents_map, turns=turns) # pytype: disable=missing-parameter + class AgentDataDict(TypedDict, total=False): """Represents data specific to multi-turn agent evaluations."""