Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 259 additions & 0 deletions tests/unit/vertexai/genai/replays/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai._genai import types
from google.genai import types as genai_types
import pandas as pd


Expand Down Expand Up @@ -96,6 +97,264 @@ def test_evaluation_byor(client):
assert case_result.response_candidate_results is not None


def test_evaluation_agent_data(client):
    """Tests evaluate method with AgentData (multi-agent, multi-turn trace)."""
    # Point the replay client at the autopush sandbox on the v1beta1 surface.
    http_options = client._api_client._http_options
    http_options.base_url = "https://autopush-aiplatform.sandbox.googleapis.com/"
    http_options.api_version = "v1beta1"

    def single_function_tool(fn_name, fn_description):
        # Builds a Tool exposing exactly one function declaration.
        return genai_types.Tool(
            function_declarations=[
                genai_types.FunctionDeclaration(
                    name=fn_name, description=fn_description
                )
            ]
        )

    def user_event(text):
        # A user-authored conversation event carrying plain text.
        return types.evals.AgentEvent(
            author="user",
            content=genai_types.Content(
                role="user", parts=[genai_types.Part(text=text)]
            ),
        )

    def call_event(author, fn_name, args):
        # A model-authored event in which *author* issues a function call.
        return types.evals.AgentEvent(
            author=author,
            content=genai_types.Content(
                role="model",
                parts=[
                    genai_types.Part(
                        function_call=genai_types.FunctionCall(
                            name=fn_name, args=args
                        )
                    )
                ],
            ),
        )

    def tool_event(author, fn_name, response):
        # A tool-authored event carrying a function-response payload.
        return types.evals.AgentEvent(
            author=author,
            content=genai_types.Content(
                role="tool",
                parts=[
                    genai_types.Part(
                        function_response=genai_types.FunctionResponse(
                            name=fn_name, response=response
                        )
                    )
                ],
            ),
        )

    def reply_event(author, text):
        # A model-authored event holding a final natural-language reply.
        return types.evals.AgentEvent(
            author=author,
            content=genai_types.Content(
                role="model", parts=[genai_types.Part(text=text)]
            ),
        )

    # A router agent that delegates to two specialist sub-agents.
    agents = {
        "coordinator": types.evals.AgentConfig(
            agent_id="coordinator",
            agent_type="RouterAgent",
            description="Root agent that delegates to specialists.",
            instruction=(
                "You are a travel coordinator. Delegate flight tasks to"
                " 'flight_bot' and hotel tasks to 'hotel_bot'."
            ),
            sub_agents=["flight_bot", "hotel_bot"],
            tools=[
                single_function_tool(
                    "delegate_to_agent",
                    "Delegates conversation to a sub-agent.",
                )
            ],
        ),
        "flight_bot": types.evals.AgentConfig(
            agent_id="flight_bot",
            agent_type="SpecialistAgent",
            description="Handles flight searches.",
            instruction="Search for flights using the available tools.",
            tools=[
                single_function_tool(
                    "search_flights",
                    "Finds flights based on origin and destination.",
                )
            ],
        ),
        "hotel_bot": types.evals.AgentConfig(
            agent_id="hotel_bot",
            agent_type="SpecialistAgent",
            description="Handles hotel searches.",
            instruction="Search for hotels using the available tools.",
            tools=[
                single_function_tool(
                    "search_hotels",
                    "Finds hotels in a given location.",
                )
            ],
        ),
    }

    # Turn 0: the user asks for a flight; the coordinator routes to
    # flight_bot, which searches, receives results, and answers.
    first_turn = types.evals.ConversationTurn(
        turn_index=0,
        events=[
            user_event("I need to book a flight to NYC for next Monday."),
            call_event(
                "coordinator",
                "delegate_to_agent",
                {"agent_name": "flight_bot"},
            ),
            call_event(
                "flight_bot",
                "search_flights",
                {"destination": "NYC", "date": "next Monday"},
            ),
            tool_event(
                "flight_bot",
                "search_flights",
                {"flights": [{"id": "UA100", "price": "$300"}]},
            ),
            reply_event(
                "flight_bot", "I found flight UA100 to NYC for $300."
            ),
        ],
    )

    # Turn 1: the user adds a hotel request; the coordinator routes to
    # hotel_bot, which searches, receives results, and answers.
    second_turn = types.evals.ConversationTurn(
        turn_index=1,
        events=[
            user_event("Great, book that. I also need a hotel there."),
            call_event(
                "coordinator",
                "delegate_to_agent",
                {"agent_name": "hotel_bot"},
            ),
            call_event(
                "hotel_bot", "search_hotels", {"location": "NYC"}
            ),
            tool_event(
                "hotel_bot",
                "search_hotels",
                {
                    "hotels": [
                        {"name": "Central Park Hotel", "rating": 4.5}
                    ]
                },
            ),
            reply_event(
                "hotel_bot", "I recommend the Central Park Hotel."
            ),
        ],
    )

    agent_data = types.evals.AgentData(
        agents=agents, turns=[first_turn, second_turn]
    )

    # Create the EvalCase and wrap it in an EvaluationDataset
    eval_dataset = types.EvaluationDataset(
        eval_cases=[types.EvalCase(agent_data=agent_data)]
    )

    metrics = [
        types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
        types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
        types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
    ]

    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

    assert isinstance(evaluation_result, types.EvaluationResult)

    # Every metric should produce an aggregated summary with a mean score.
    assert evaluation_result.summary_metrics is not None
    assert len(evaluation_result.summary_metrics) > 0
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None
        assert summary.mean_score is not None

    # Each eval case should yield per-candidate results.
    assert evaluation_result.eval_case_results is not None
    assert len(evaluation_result.eval_case_results) > 0
    for case_result in evaluation_result.eval_case_results:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None


pytestmark = pytest_helper.setup(
file=__file__,
globals_for_file=globals(),
Expand Down
33 changes: 0 additions & 33 deletions tests/unit/vertexai/genai/replays/test_evaluate_instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,39 +105,6 @@ def test_pointwise_metric(client):
assert response.pointwise_metric_result.score is not None


# def test_predefined_metric_with_agent_data(client):
# """Tests the _evaluate_instances method with predefined metric and agent_data."""
# agent_data = types.evals.AgentData(
# agent_config=types.evals.AgentConfig(
# tools=[
# genai_types.Tool(
# function_declarations=[
# genai_types.FunctionDeclaration(name="search")
# ]
# )
# ],
# developer_instruction=types.evals.InstanceData(text="instruction"),
# ),
# events=types.evals.Events(
# event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
# ),
# )
# instance = types.EvaluationInstance(
# prompt=types.evals.InstanceData(text="What is the capital of France?"),
# response=types.evals.InstanceData(text="Paris"),
# reference=types.evals.InstanceData(text="Paris"),
# agent_data=agent_data,
# )

# response = client.evals.evaluate_instances(
# metric_config=types._EvaluateInstancesRequestParameters(
# metrics=[types.Metric(name="general_quality_v1")],
# instance=instance,
# )
# )
# assert response.metric_results[0].score is not None


def test_pairwise_metric_with_autorater(client):
"""Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig."""

Expand Down
24 changes: 14 additions & 10 deletions vertexai/_genai/_evals_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1047,8 +1047,10 @@ def _resolve_dataset_inputs(
datasets_to_process = dataset
logger.info("Processing %s dataset(s).", num_response_candidates)

loaded_raw_datasets: list[list[dict[str, Any]]] = []
schemas_for_merge: list[str] = []
if len(datasets_to_process) == 1 and datasets_to_process[0].eval_cases:
return datasets_to_process[0], 1

parsed_evaluation_datasets: list[types.EvaluationDataset] = []

for i, ds_item in enumerate(datasets_to_process):
if not isinstance(ds_item, types.EvaluationDataset):
Expand All @@ -1062,17 +1064,20 @@ def _resolve_dataset_inputs(
f"Item at index {i} is not an EvaluationDataset: {type(ds_item)}"
)

if ds_item.eval_cases:
logger.info("Dataset %d already contains eval_cases.", i)
parsed_evaluation_datasets.append(ds_item)
continue

ds_source_for_loader = _get_dataset_source(ds_item)
current_loaded_data = loader.load(ds_source_for_loader)
loaded_raw_datasets.append(current_loaded_data)

if dataset_schema:
current_schema = _evals_data_converters.EvalDatasetSchema(dataset_schema)
else:
current_schema = _evals_data_converters.auto_detect_dataset_schema( # type: ignore[assignment]
current_loaded_data
)
schemas_for_merge.append(current_schema)

logger.info(
"Dataset %d: Schema: %s. Using %s converter.",
Expand All @@ -1082,13 +1087,12 @@ def _resolve_dataset_inputs(
current_schema
).__class__.__name__,
)
converter = _evals_data_converters.get_dataset_converter(current_schema)
parsed_evaluation_datasets.append(converter.convert(current_loaded_data))

processed_eval_dataset = (
_evals_data_converters.merge_response_datasets_into_canonical_format(
raw_datasets=loaded_raw_datasets,
schemas=schemas_for_merge,
agent_info=agent_info,
)
processed_eval_dataset = _evals_data_converters.merge_evaluation_datasets(
datasets=parsed_evaluation_datasets,
agent_info=agent_info,
)

if not processed_eval_dataset.eval_cases:
Expand Down
3 changes: 3 additions & 0 deletions vertexai/_genai/_evals_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
"safety_v1",
"multi_turn_general_quality_v1",
"multi_turn_text_quality_v1",
"multi_turn_tool_use_quality_v1",
"multi_turn_trajectory_quality_v1",
"multi_turn_task_success_v1",
"final_response_match_v2",
"final_response_reference_free_v1",
"final_response_quality_v1",
Expand Down
Loading
Loading