From b8d2c9958e5a0ce8c7efba6f6b7071aa650afffe Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Tue, 17 Feb 2026 13:58:22 -0800 Subject: [PATCH] feat: Add PromptTemplateData to support `context` and `history` columns when creating Evaluation run from dataframe PiperOrigin-RevId: 871483777 --- .../replays/test_create_evaluation_run.py | 471 +++++++++++++----- vertexai/_genai/_evals_common.py | 132 ++++- vertexai/_genai/_evals_constant.py | 2 + vertexai/_genai/evals.py | 124 ++++- vertexai/_genai/types/__init__.py | 6 + vertexai/_genai/types/common.py | 44 ++ 6 files changed, 622 insertions(+), 157 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index cd97ab042c..df125532c2 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -18,6 +18,7 @@ from vertexai import types from google.genai import types as genai_types import pytest +import pandas as pd GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output" GENERAL_QUALITY_METRIC = types.EvaluationRunMetric( @@ -63,29 +64,48 @@ ) ), ) +TOOL = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="get_weather", + description="Get weather in a location", + parameters={ + "type": "object", + "properties": {"location": {"type": "string"}}, + }, + ) + ] +) +AGENT_INFO = types.evals.AgentInfo( + agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456", + name="agent-1", + instruction="agent-1 instruction", + tool_declarations=[TOOL], +) +DEFAULT_PROMPT_TEMPLATE = "{prompt}" +INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame( + { + "prompt": ["prompt1", "prompt2"], + "reference": ["reference1", "reference2"], + "response": ["response1", "response2"], + "context": ["context1", "context2"], + "conversation_history": ["history1", "history2"], + } +) +CANDIDATE_NAME = 
"candidate_1" +MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" +EVAL_SET_NAME = ( + "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" +) def test_create_eval_run_data_source_evaluation_set(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" client._api_client._http_options.api_version = "v1beta1" - tool = genai_types.Tool( - function_declarations=[ - genai_types.FunctionDeclaration( - name="get_weather", - description="Get weather in a location", - parameters={ - "type": "object", - "properties": {"location": {"type": "string"}}, - }, - ) - ] - ) evaluation_run = client.evals.create_evaluation_run( name="test4", display_name="test4", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), dest=GCS_DEST, metrics=[ GENERAL_QUALITY_METRIC, @@ -94,21 +114,14 @@ def test_create_eval_run_data_source_evaluation_set(client): EXACT_MATCH_COMPUTATION_BASED_METRIC, BLEU_COMPUTATION_BASED_METRIC, ], - agent_info=types.evals.AgentInfo( - agent_resource_name="project/123/locations/us-central1/reasoningEngines/456", - name="agent-1", - instruction="agent-1 instruction", - tool_declarations=[tool], - ), + agent_info=AGENT_INFO, labels={"label1": "value1"}, ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test4" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( 
output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) @@ -122,13 +135,13 @@ def test_create_eval_run_data_source_evaluation_set(client): ], ) assert evaluation_run.inference_configs[ - "agent-1" + AGENT_INFO.name ] == types.EvaluationRunInferenceConfig( agent_config=types.EvaluationRunAgentConfig( developer_instruction=genai_types.Content( parts=[genai_types.Part(text="agent-1 instruction")] ), - tools=[tool], + tools=[TOOL], ) ) assert evaluation_run.labels == { @@ -190,14 +203,15 @@ def test_create_eval_run_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" client._api_client._http_options.api_version = "v1beta1" inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template="test prompt template" + ), ) evaluation_run = client.evals.create_evaluation_run( name="test_inference_config", display_name="test_inference_config", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], inference_configs={"model_1": inference_config}, @@ -207,9 +221,7 @@ def test_create_eval_run_with_inference_configs(client): assert evaluation_run.display_name == "test_inference_config" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert 
evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) @@ -223,102 +235,294 @@ def test_create_eval_run_with_inference_configs(client): assert evaluation_run.error is None -# Test fails in replay mode because of UUID generation mismatch. -# def test_create_eval_run_data_source_evaluation_dataset(client): -# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset.""" -# input_df = pd.DataFrame( -# { -# "prompt": ["prompt1", "prompt2"], -# "reference": ["reference1", "reference2"], -# "response": ["response1", "response2"], -# "intermediate_events": [ -# [ -# { -# "content": { -# "parts": [ -# {"text": "first user input"}, -# ], -# "role": "user", -# }, -# }, -# { -# "content": { -# "parts": [ -# {"text": "first model response"}, -# ], -# "role": "model", -# }, -# }, -# ], -# [ -# { -# "content": { -# "parts": [ -# {"text": "second user input"}, -# ], -# "role": "user", -# }, -# }, -# { -# "content": { -# "parts": [ -# {"text": "second model response"}, -# ], -# "role": "model", -# }, -# }, -# ], -# ], -# } -# ) -# evaluation_run = client.evals.create_evaluation_run( -# name="test6", -# display_name="test6", -# dataset=types.EvaluationDataset( -# candidate_name="candidate_1", -# eval_dataset_df=input_df, -# ), -# dest=GCS_DEST, -# metrics=[GENERAL_QUALITY_METRIC], -# ) -# assert isinstance(evaluation_run, types.EvaluationRun) -# assert evaluation_run.display_name == "test6" -# assert evaluation_run.state == types.EvaluationRunState.PENDING -# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) -# # Check evaluation set -# assert evaluation_run.data_source.evaluation_set -# eval_set = client.evals.get_evaluation_set( -# name=evaluation_run.data_source.evaluation_set -# ) -# assert len(eval_set.evaluation_items) == 2 -# # Check evaluation items -# for i, eval_item_name in 
enumerate(eval_set.evaluation_items): -# eval_item = client.evals.get_evaluation_item(name=eval_item_name) -# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST -# assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"] -# assert ( -# eval_item.evaluation_request.candidate_responses[0].text -# == input_df.iloc[i]["response"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text -# == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[0].role -# == input_df.iloc[i]["intermediate_events"][0]["content"]["role"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text -# == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[1].role -# == input_df.iloc[i]["intermediate_events"][1]["content"]["role"] -# ) -# assert evaluation_run.error is None +# Dataframe tests fail in replay mode because of UUID generation mismatch. +def test_create_eval_run_data_source_evaluation_dataset(client): + """Tests that create_evaluation_run() creates a correctly structured + EvaluationRun with EvaluationDataset. 
+ """ + input_df = pd.DataFrame( + { + "prompt": ["prompt1", "prompt2"], + "reference": ["reference1", "reference2"], + "response": ["response1", "response2"], + "intermediate_events": [ + [ + { + "content": { + "parts": [ + {"text": "first user input"}, + ], + "role": "user", + }, + }, + { + "content": { + "parts": [ + {"text": "first model response"}, + ], + "role": "model", + }, + }, + ], + [ + { + "content": { + "parts": [ + {"text": "second user input"}, + ], + "role": "user", + }, + }, + { + "content": { + "parts": [ + {"text": "second model response"}, + ], + "role": "model", + }, + }, + ], + ], + } + ) + evaluation_run = client.evals.create_evaluation_run( + name="test6", + display_name="test6", + dataset=types.EvaluationDataset( + candidate_name=CANDIDATE_NAME, + eval_dataset_df=input_df, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test6" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + # Check evaluation items + for i, eval_item_name in enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"] + assert ( + eval_item.evaluation_request.candidate_responses[0].text + == input_df.iloc[i]["response"] + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text + == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"] + ) + assert ( + 
eval_item.evaluation_request.candidate_responses[0].events[0].role + == input_df.iloc[i]["intermediate_events"][0]["content"]["role"] + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text + == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"] + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].events[1].role + == input_df.iloc[i]["intermediate_events"][1]["content"]["role"] + ) + assert evaluation_run.error is None +def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data( + client, +): + """Tests that create_evaluation_run() creates a correctly structured + EvaluationRun with EvaluationDataset and inference_configs. + Prompt template data is inferred from the dataset and a default prompt + template should be used. + """ + evaluation_run = client.evals.create_evaluation_run( + name="test9", + display_name="test9", + dataset=types.EvaluationDataset( + candidate_name=CANDIDATE_NAME, + eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + inference_configs={ + CANDIDATE_NAME: types.EvaluationRunInferenceConfig( + model=MODEL_NAME, + ) + }, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test9" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.inference_configs[ + CANDIDATE_NAME + ] == types.EvaluationRunInferenceConfig( + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template=DEFAULT_PROMPT_TEMPLATE + ), + ) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + # Check evaluation items + for i, eval_item_name in 
enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "prompt" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "context" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "conversation_history" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=( + INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][ + "conversation_history" + ] + ) + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].text + == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"] + ) + assert evaluation_run.error is None + + +def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data( + client, +): + """Tests that create_evaluation_run() creates a correctly structured + EvaluationRun with EvaluationDataset and agent_info. + Prompt template data is inferred from the dataset and a default prompt + template should be used. 
+ """ + evaluation_run = client.evals.create_evaluation_run( + name="test9", + display_name="test9", + dataset=types.EvaluationDataset( + candidate_name=CANDIDATE_NAME, + eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + agent_info=AGENT_INFO, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test9" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.inference_configs[ + AGENT_INFO.name + ] == types.EvaluationRunInferenceConfig( + agent_config=types.EvaluationRunAgentConfig( + developer_instruction=genai_types.Content( + parts=[genai_types.Part(text=AGENT_INFO.instruction)] + ), + tools=[TOOL], + ), + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template=DEFAULT_PROMPT_TEMPLATE + ), + ) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + # Check evaluation items + for i, eval_item_name in enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "prompt" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "context" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "conversation_history" + ] + == 
genai_types.Content( + parts=[ + genai_types.Part( + text=( + INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][ + "conversation_history" + ] + ) + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].text + == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"] + ) + assert evaluation_run.error is None + pytest_plugins = ("pytest_asyncio",) @@ -371,14 +575,15 @@ async def test_create_eval_run_async_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously.""" client._api_client._http_options.api_version = "v1beta1" inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template="Test the {prompt}" + ), ) evaluation_run = await client.aio.evals.create_evaluation_run( name="test_inference_config_async", display_name="test_inference_config_async", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], inference_configs={"model_1": inference_config}, @@ -388,9 +593,7 @@ async def test_create_eval_run_async_with_inference_configs(client): assert evaluation_run.display_name == "test_inference_config_async" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( 
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 0bc28994ed..3a6b11ddf2 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -277,13 +277,51 @@ def _resolve_dataset( return dataset +def _get_default_prompt_template( + api_client: BaseApiClient, + inference_config: types.EvaluationRunInferenceConfigOrDict, + dataset: types.EvaluationRunDataSource, +) -> Any: + """Resolves prompt template data for the evaluation run.""" + if isinstance(inference_config, dict): + if inference_config.get("prompt_template"): + return inference_config["prompt_template"] + elif inference_config.prompt_template: + return inference_config.prompt_template + + try: + evals_module = evals.Evals(api_client_=api_client) + eval_set = evals_module.get_evaluation_set(name=dataset.evaluation_set) + if eval_set and eval_set.evaluation_items: + eval_item = evals_module.get_evaluation_item( + name=eval_set.evaluation_items[0] + ) + if ( + eval_item + and eval_item.evaluation_request + and eval_item.evaluation_request.prompt + and eval_item.evaluation_request.prompt.prompt_template_data + ): + if ( + "prompt" + in eval_item.evaluation_request.prompt.prompt_template_data.values + ): + return "{prompt}" + except Exception as e: + logger.warning("Failed to get prompt template from evaluation set: %s", e) + return None + + def _resolve_inference_configs( + api_client: BaseApiClient, + dataset: types.EvaluationRunDataSource, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, agent_info_pydantic: Optional[types.evals.AgentInfo] = None, ) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]: """Resolves inference configs for the evaluation run.""" + # Resolve agent config if agent_info_pydantic and agent_info_pydantic.name: inference_configs = {} inference_configs[agent_info_pydantic.name] = ( @@ -296,13 +334,41 @@ 
def _resolve_inference_configs( ) ) ) + # Resolve prompt template data + if inference_configs: + for inference_config in inference_configs.values(): + prompt_template_val = ( + inference_config.get("prompt_template") + if isinstance(inference_config, dict) + else inference_config.prompt_template + ) + if not prompt_template_val: + default_prompt_template = _get_default_prompt_template( + api_client, inference_config, dataset + ) + if default_prompt_template: + prompt_template_to_set = default_prompt_template + if not isinstance( + default_prompt_template, types.EvaluationRunPromptTemplate + ): + prompt_template_to_set = types.EvaluationRunPromptTemplate( + prompt_template=default_prompt_template + ) + if isinstance(inference_config, dict): + inference_config[ + "prompt_template" + ] = prompt_template_to_set.model_dump(exclude_none=True) + else: + # NOTE(review): assign the typed object, not its dict dump, + # so the pydantic field stays type-consistent. + inference_config.prompt_template = prompt_template_to_set return inference_configs def _add_evaluation_run_labels( labels: Optional[dict[str, str]] = None, agent_info_pydantic: Optional[types.evals.AgentInfo] = None, -) -> Optional[dict[str, str]]: +) -> Optional[dict[str, str]]: """Adds labels to the evaluation run.""" if agent_info_pydantic and agent_info_pydantic.agent_resource_name: labels = labels or {} @@ -324,8 +390,7 @@ def _get_candidate_name( and dataset.candidate_name != agent_info_pydantic.name ): logger.warning( - "Evaluation dataset candidate_name and agent_info.name are different." - " Please make sure this is intended." + "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
) elif dataset.candidate_name is None and agent_info_pydantic: return agent_info_pydantic.name @@ -1944,6 +2009,21 @@ def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]: return result +def _get_content(row: dict[str, Any], column: str): + if isinstance(row[column], str): + return genai_types.Content( + parts=[genai_types.Part(text=row[column])], + role=_evals_constant.USER_AUTHOR, + ) + elif isinstance(row[column], genai_types.Content): + return row[column] + else: + raise ValueError( + f"{column} must be a string or a Content object. " + f"Got {type(row[column])}." + ) + + def _create_evaluation_set_from_dataframe( api_client: BaseApiClient, gcs_dest_prefix: str, @@ -1962,29 +2042,43 @@ def _create_evaluation_set_from_dataframe( for event in row[_evals_constant.INTERMEDIATE_EVENTS]: if CONTENT in event: intermediate_events.append(event[CONTENT]) + if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row: + values = {} + if _evals_constant.CONTEXT in row: + values[_evals_constant.CONTEXT] = _get_content( + row, _evals_constant.CONTEXT + ) + if _evals_constant.HISTORY in row: + values[_evals_constant.HISTORY] = _get_content( + row, _evals_constant.HISTORY + ) + if _evals_constant.PROMPT in row: + values[_evals_constant.PROMPT] = _get_content( + row, _evals_constant.PROMPT + ) + prompt = types.EvaluationPrompt( + prompt_template_data=types.PromptTemplateData(values=values) + ) + else: + prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) if _evals_constant.PROMPT in row else None + candidate_responses = [] + if _evals_constant.RESPONSE in row: + candidate_responses.append( + types.CandidateResponse( + candidate=candidate_name or "Candidate 1", + text=row[_evals_constant.RESPONSE], + events=intermediate_events or None, + ) + ) eval_item_requests.append( types.EvaluationItemRequest( - prompt=( - types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) - if _evals_constant.PROMPT in row - else None - ), + prompt=prompt or None, golden_response=(
types.CandidateResponse(text=row[_evals_constant.REFERENCE]) if _evals_constant.REFERENCE in row else None ), - candidate_responses=[ - types.CandidateResponse( - candidate=candidate_name or "Candidate 1", - text=row.get(_evals_constant.RESPONSE, None), - events=( - intermediate_events - if len(intermediate_events) > 0 - else None - ), - ) - ], + candidate_responses=candidate_responses or None, ) ) logger.info("Writing evaluation item requests to GCS.") diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py index 6fc27d94e0..847140dc5c 100644 --- a/vertexai/_genai/_evals_constant.py +++ b/vertexai/_genai/_evals_constant.py @@ -53,6 +53,7 @@ CONTENT = "content" PARTS = "parts" USER_AUTHOR = "user" +HISTORY = "conversation_history" COMMON_DATASET_COLUMNS = frozenset( { @@ -61,5 +62,6 @@ REFERENCE, SESSION_INPUT, CONTEXT, + HISTORY, } ) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index bbf5a56c56..bf9f932697 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -94,7 +94,12 @@ def _CreateEvaluationRunParameters_to_vertex( setv(to_object, ["labels"], getv(from_object, ["labels"])) if getv(from_object, ["inference_configs"]) is not None: - setv(to_object, ["inferenceConfigs"], getv(from_object, ["inference_configs"])) + inference_configs = getv(from_object, ["inference_configs"]) + vertex_inference_configs = { + k: _EvaluationRunInferenceConfig_to_vertex(v) + for k, v in inference_configs.items() + } + setv(to_object, ["inferenceConfigs"], vertex_inference_configs) if getv(from_object, ["config"]) is not None: setv(to_object, ["config"], getv(from_object, ["config"])) @@ -254,6 +259,9 @@ def _EvaluationRunConfig_from_vertex( if getv(from_object, ["autoraterConfig"]) is not None: setv(to_object, ["autorater_config"], getv(from_object, ["autoraterConfig"])) + if getv(from_object, ["promptTemplate"]) is not None: + setv(to_object, ["prompt_template"], getv(from_object, ["promptTemplate"])) + return 
to_object @@ -278,6 +286,15 @@ def _EvaluationRunConfig_to_vertex( if getv(from_object, ["autorater_config"]) is not None: setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"])) + if getv(from_object, ["prompt_template"]) is not None: + setv( + to_object, + ["promptTemplate"], + _EvaluationRunPromptTemplate_to_vertex( + getv(from_object, ["prompt_template"]) + ), + ) + return to_object @@ -370,7 +387,12 @@ def _EvaluationRun_from_vertex( ) if getv(from_object, ["inferenceConfigs"]) is not None: - setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"])) + inference_configs = getv(from_object, ["inferenceConfigs"]) + vertex_inference_configs = { + k: _EvaluationRunInferenceConfig_from_vertex(v) + for k, v in inference_configs.items() + } + setv(to_object, ["inference_configs"], vertex_inference_configs) if getv(from_object, ["labels"]) is not None: setv(to_object, ["labels"], getv(from_object, ["labels"])) @@ -538,6 +560,99 @@ def _RubricBasedMetricSpec_to_vertex( return to_object +def _EvaluationRunPromptTemplate_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["prompt_template"]) is not None: + setv(to_object, ["promptTemplate"], getv(from_object, ["prompt_template"])) + if getv(from_object, ["gcs_uri"]) is not None: + setv(to_object, ["gcsUri"], getv(from_object, ["gcs_uri"])) + return to_object + + +def _EvaluationRunPromptTemplate_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["promptTemplate"]) is not None: + setv(to_object, ["prompt_template"], getv(from_object, ["promptTemplate"])) + if getv(from_object, ["gcsUri"]) is not None: + setv(to_object, ["gcs_uri"], getv(from_object, ["gcsUri"])) + return to_object + + +def 
_EvaluationRunAgentConfig_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["developer_instruction"]) is not None: + setv(to_object, ["developerInstruction"], getv(from_object, ["developer_instruction"])) + if getv(from_object, ["tools"]) is not None: + setv(to_object, ["tools"], getv(from_object, ["tools"])) + return to_object + + +def _EvaluationRunAgentConfig_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["developerInstruction"]) is not None: + setv(to_object, ["developer_instruction"], getv(from_object, ["developerInstruction"])) + if getv(from_object, ["tools"]) is not None: + setv(to_object, ["tools"], getv(from_object, ["tools"])) + return to_object + + +def _EvaluationRunInferenceConfig_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["model"]) is not None: + setv(to_object, ["model"], getv(from_object, ["model"])) + if getv(from_object, ["prompt_template"]) is not None: + setv( + to_object, + ["promptTemplate"], + _EvaluationRunPromptTemplate_to_vertex(getv(from_object, ["prompt_template"])), + ) + if getv(from_object, ["agent_config"]) is not None: + setv(to_object, ["agentConfig"], _EvaluationRunAgentConfig_to_vertex(getv(from_object, ["agent_config"]), to_object)) + # NOTE(review): a duplicate "model" mapping was removed here; "model" + # is already set at the top of this function. + if getv(from_object, ["user_simulator_config"]) is not None: + setv(to_object, ["userSimulatorConfig"], getv(from_object, ["user_simulator_config"])) + return to_object + + +def _EvaluationRunInferenceConfig_from_vertex( + from_object: Union[dict[str, Any], object], +
parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + """Converts a Vertex InferenceConfig response to SDK format.""" + to_object: dict[str, Any] = {} + if getv(from_object, ["model"]) is not None: + setv(to_object, ["model"], getv(from_object, ["model"])) + if getv(from_object, ["promptTemplate"]) is not None: + setv( + to_object, + ["prompt_template"], + _EvaluationRunPromptTemplate_from_vertex(getv(from_object, ["promptTemplate"])), + ) + if getv(from_object, ["agentConfig"]) is not None: + setv(to_object, ["agent_config"], _EvaluationRunAgentConfig_from_vertex(getv(from_object, ["agentConfig"]), to_object)) + # NOTE(review): a duplicate "model" mapping was removed here; "model" + # is already set at the top of this function. + if getv(from_object, ["userSimulatorConfig"]) is not None: + setv(to_object, ["user_simulator_config"], getv(from_object, ["userSimulatorConfig"])) + return to_object + + def _RubricGenerationSpec_to_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -778,6 +893,7 @@ def _create_evaluation_run( request_dict = _common.convert_to_dict(request_dict) request_dict = _common.encode_unserializable_types(request_dict) + logger.debug("request_dict: %s", request_dict) response = self._api_client.request("post", path, request_dict, http_options) @@ -1732,7 +1848,7 @@ def create_evaluation_run( output_config=output_config, metrics=resolved_metrics ) resolved_inference_configs = _evals_common._resolve_inference_configs( - inference_configs, agent_info_pydantic + self._api_client, resolved_dataset, inference_configs, agent_info_pydantic ) resolved_labels = _evals_common._add_evaluation_run_labels( labels, agent_info_pydantic @@ -2660,7 +2776,7 @@ async def create_evaluation_run( output_config=output_config, metrics=resolved_metrics ) resolved_inference_configs = _evals_common._resolve_inference_configs( - inference_configs, agent_info_pydantic + self._api_client, resolved_dataset,
inference_configs, agent_info_pydantic ) resolved_labels = _evals_common._add_evaluation_run_labels( labels, agent_info_pydantic diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index d3a933f84d..4049382811 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -387,6 +387,9 @@ from .common import EvaluationRunMetricDict from .common import EvaluationRunMetricOrDict from .common import EvaluationRunOrDict +from .common import EvaluationRunPromptTemplate +from .common import EvaluationRunPromptTemplateDict +from .common import EvaluationRunPromptTemplateOrDict from .common import EvaluationRunResults from .common import EvaluationRunResultsDict from .common import EvaluationRunResultsOrDict @@ -1298,6 +1301,9 @@ "EvaluationRunMetric", "EvaluationRunMetricDict", "EvaluationRunMetricOrDict", + "EvaluationRunPromptTemplate", + "EvaluationRunPromptTemplateDict", + "EvaluationRunPromptTemplateOrDict", "EvaluationRunConfig", "EvaluationRunConfigDict", "EvaluationRunConfigOrDict", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index a4ab38e698..2d9fa4c22a 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -2287,6 +2287,38 @@ class EvaluationRunMetricDict(TypedDict, total=False): EvaluationRunMetricOrDict = Union[EvaluationRunMetric, EvaluationRunMetricDict] +class EvaluationRunPromptTemplate(_common.BaseModel): + """Prompt template used for inference.""" + + prompt_template: Optional[str] = Field( + default=None, + description="""Inline prompt template. Template variables should be in the format + "{var_name}".""", + ) + gcs_uri: Optional[str] = Field( + default=None, + description="""Prompt template stored in Cloud Storage. 
Format: + "gs://my-bucket/file-name.txt".""", + ) + + +class EvaluationRunPromptTemplateDict(TypedDict, total=False): + """Prompt template used for inference.""" + + prompt_template: Optional[str] + """Inline prompt template. Template variables should be in the format + "{var_name}".""" + + gcs_uri: Optional[str] + """Prompt template stored in Cloud Storage. Format: + "gs://my-bucket/file-name.txt".""" + + +EvaluationRunPromptTemplateOrDict = Union[ + EvaluationRunPromptTemplate, EvaluationRunPromptTemplateDict +] + + class EvaluationRunConfig(_common.BaseModel): """The evaluation configuration used for the evaluation run.""" @@ -2300,6 +2332,9 @@ class EvaluationRunConfig(_common.BaseModel): autorater_config: Optional[genai_types.AutoraterConfig] = Field( default=None, description="""The autorater config for the evaluation run.""" ) + prompt_template: Optional[EvaluationRunPromptTemplate] = Field( + default=None, description="""The prompt template used for inference.""" + ) class EvaluationRunConfigDict(TypedDict, total=False): @@ -2314,6 +2349,9 @@ class EvaluationRunConfigDict(TypedDict, total=False): autorater_config: Optional[genai_types.AutoraterConfigDict] """The autorater config for the evaluation run.""" + prompt_template: Optional[EvaluationRunPromptTemplateDict] + """The prompt template used for inference.""" + EvaluationRunConfigOrDict = Union[EvaluationRunConfig, EvaluationRunConfigDict] @@ -3039,6 +3077,9 @@ class EvaluationRunInferenceConfig(_common.BaseModel): default=None, description="""The fully qualified name of the publisher model or endpoint to use for inference.""", ) + prompt_template: Optional[EvaluationRunPromptTemplate] = Field( + default=None, description="""The prompt template used for inference.""" + ) user_simulator_config: Optional[evals_types.UserSimulatorConfig] = Field( default=None, description="""Used for multi-turn agent run. 
@@ -3059,6 +3100,9 @@ class EvaluationRunInferenceConfigDict(TypedDict, total=False): model: Optional[str] """The fully qualified name of the publisher model or endpoint to use for inference.""" + prompt_template: Optional[EvaluationRunPromptTemplateDict] + """The prompt template used for inference.""" + user_simulator_config: Optional[evals_types.UserSimulatorConfig] """Used for multi-turn agent run. Contains configuration for a user simulator that