googleapis
diff --git a/‎tests/unit/vertexai/genai/replays/test_create_evaluation_run.py‎
Lines changed: 246 additions & 43 deletions b/‎tests/unit/vertexai/genai/replays/test_create_evaluation_run.py‎
Lines changed: 246 additions & 43 deletions
@@ -18,6 +18,7 @@
 from vertexai import types
 from google.genai import types as genai_types
 import pytest
+import pandas as pd
 
 GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
 GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
@@ -63,29 +64,48 @@
         )
     ),
 )
+TOOL = genai_types.Tool(
+    function_declarations=[
+        genai_types.FunctionDeclaration(
+            name="get_weather",
+            description="Get weather in a location",
+            parameters={
+                "type": "object",
+                "properties": {"location": {"type": "string"}},
+            },
+        )
+    ]
+)
+AGENT_INFO = types.evals.AgentInfo(
+    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
+    name="agent-1",
+    instruction="agent-1 instruction",
+    tool_declarations=[TOOL],
+)
+DEFAULT_PROMPT_TEMPLATE = "{prompt}"
+INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
+    {
+        "prompt": ["prompt1", "prompt2"],
+        "reference": ["reference1", "reference2"],
+        "response": ["response1", "response2"],
+        "context": ["context1", "context2"],
+        "conversation_history": ["history1", "history2"],
+    }
+)
+CANDIDATE_NAME = "candidate_1"
+MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+EVAL_SET_NAME = (
+    "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+)
 
 
 def test_create_eval_run_data_source_evaluation_set(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
     client._api_client._http_options.api_version = "v1beta1"
-    tool = genai_types.Tool(
-        function_declarations=[
-            genai_types.FunctionDeclaration(
-                name="get_weather",
-                description="Get weather in a location",
-                parameters={
-                    "type": "object",
-                    "properties": {"location": {"type": "string"}},
-                },
-            )
-        ]
-    )
     evaluation_run = client.evals.create_evaluation_run(
         name="test4",
         display_name="test4",
-        dataset=types.EvaluationRunDataSource(
-            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-        ),
+        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
         dest=GCS_DEST,
         metrics=[
             GENERAL_QUALITY_METRIC,
@@ -94,21 +114,14 @@ def test_create_eval_run_data_source_evaluation_set(client):
             EXACT_MATCH_COMPUTATION_BASED_METRIC,
             BLEU_COMPUTATION_BASED_METRIC,
         ],
-        agent_info=types.evals.AgentInfo(
-            agent_resource_name="project/123/locations/us-central1/reasoningEngines/456",
-            name="agent-1",
-            instruction="agent-1 instruction",
-            tool_declarations=[tool],
-        ),
+        agent_info=AGENT_INFO,
         labels={"label1": "value1"},
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test4"
     assert evaluation_run.state == types.EvaluationRunState.PENDING
     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-    assert evaluation_run.data_source.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-    )
+    assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -122,13 +135,13 @@ def test_create_eval_run_data_source_evaluation_set(client):
         ],
     )
     assert evaluation_run.inference_configs[
-        "agent-1"
+        AGENT_INFO.name
     ] == types.EvaluationRunInferenceConfig(
         agent_config=types.EvaluationRunAgentConfig(
             developer_instruction=genai_types.Content(
                 parts=[genai_types.Part(text="agent-1 instruction")]
             ),
-            tools=[tool],
+            tools=[TOOL],
         )
     )
     assert evaluation_run.labels == {
@@ -190,14 +203,15 @@ def test_create_eval_run_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
     client._api_client._http_options.api_version = "v1beta1"
     inference_config = types.EvaluationRunInferenceConfig(
-        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+        model=MODEL_NAME,
+        prompt_template=types.EvaluationRunPromptTemplate(
+            prompt_template="test prompt template"
+        ),
     )
     evaluation_run = client.evals.create_evaluation_run(
         name="test_inference_config",
         display_name="test_inference_config",
-        dataset=types.EvaluationRunDataSource(
-            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-        ),
+        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
         dest=GCS_DEST,
         metrics=[GENERAL_QUALITY_METRIC],
         inference_configs={"model_1": inference_config},
@@ -207,9 +221,7 @@ def test_create_eval_run_with_inference_configs(client):
     assert evaluation_run.display_name == "test_inference_config"
     assert evaluation_run.state == types.EvaluationRunState.PENDING
     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-    assert evaluation_run.data_source.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-    )
+    assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -223,9 +235,11 @@ def test_create_eval_run_with_inference_configs(client):
     assert evaluation_run.error is None
 
 
-# Test fails in replay mode because of UUID generation mismatch.
+# The DataFrame-based tests below are commented out because they fail in replay mode (UUID generation mismatch).
 # def test_create_eval_run_data_source_evaluation_dataset(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
+#     """Tests that create_evaluation_run() creates a correctly structured
+#     EvaluationRun with EvaluationDataset.
+#     """
 #     input_df = pd.DataFrame(
 #         {
 #             "prompt": ["prompt1", "prompt2"],
@@ -275,7 +289,7 @@ def test_create_eval_run_with_inference_configs(client):
 #         name="test6",
 #         display_name="test6",
 #         dataset=types.EvaluationDataset(
-#             candidate_name="candidate_1",
+#             candidate_name=CANDIDATE_NAME,
 #             eval_dataset_df=input_df,
 #         ),
 #         dest=GCS_DEST,
@@ -319,6 +333,196 @@ def test_create_eval_run_with_inference_configs(client):
 #     assert evaluation_run.error is None
 
 
+# def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data(
+#     client,
+# ):
+#     """Tests that create_evaluation_run() creates a correctly structured
+#     EvaluationRun with EvaluationDataset and inference_configs.
+#     Prompt template data is inferred from the dataset and a default prompt
+#     template should be used.
+#     """
+#     evaluation_run = client.evals.create_evaluation_run(
+#         name="test9",
+#         display_name="test9",
+#         dataset=types.EvaluationDataset(
+#             candidate_name=CANDIDATE_NAME,
+#             eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
+#         ),
+#         dest=GCS_DEST,
+#         metrics=[GENERAL_QUALITY_METRIC],
+#         inference_configs={
+#             CANDIDATE_NAME: types.EvaluationRunInferenceConfig(
+#                 model=MODEL_NAME,
+#             )
+#         },
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.display_name == "test9"
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+#     assert evaluation_run.inference_configs[
+#         CANDIDATE_NAME
+#     ] == types.EvaluationRunInferenceConfig(
+#         model=MODEL_NAME,
+#         prompt_template=types.EvaluationRunPromptTemplate(
+#             prompt_template=DEFAULT_PROMPT_TEMPLATE
+#         ),
+#     )
+#     # Check evaluation set
+#     assert evaluation_run.data_source.evaluation_set
+#     eval_set = client.evals.get_evaluation_set(
+#         name=evaluation_run.data_source.evaluation_set
+#     )
+#     assert len(eval_set.evaluation_items) == 2
+#     # Check evaluation items
+#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
+#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "prompt"
+#             ]
+#             == genai_types.Content(
+#                 parts=[
+#                     genai_types.Part(
+#                         text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
+#                     )
+#                 ],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "context"
+#             ]
+#             == genai_types.Content(
+#                 parts=[
+#                     genai_types.Part(
+#                         text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
+#                     )
+#                 ],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "conversation_history"
+#             ]
+#             == genai_types.Content(
+#                 parts=[
+#                     genai_types.Part(
+#                         text=(
+#                             INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
+#                                 "conversation_history"
+#                             ]
+#                         )
+#                     )
+#                 ],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].text
+#             == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
+#         )
+#     assert evaluation_run.error is None
+
+
+# def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data(
+#     client,
+# ):
+#     """Tests that create_evaluation_run() creates a correctly structured
+#     EvaluationRun with EvaluationDataset and agent_info.
+#     Prompt template data is inferred from the dataset and a default prompt
+#     template should be used.
+#     """
+#     evaluation_run = client.evals.create_evaluation_run(
+#         name="test9",
+#         display_name="test9",
+#         dataset=types.EvaluationDataset(
+#             candidate_name=CANDIDATE_NAME,
+#             eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
+#         ),
+#         dest=GCS_DEST,
+#         metrics=[GENERAL_QUALITY_METRIC],
+#         agent_info=AGENT_INFO,
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.display_name == "test9"
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+#     assert evaluation_run.inference_configs[
+#         AGENT_INFO.name
+#     ] == types.EvaluationRunInferenceConfig(
+#         agent_config=types.EvaluationRunAgentConfig(
+#             developer_instruction=genai_types.Content(
+#                 parts=[genai_types.Part(text=AGENT_INFO.instruction)]
+#             ),
+#             tools=[TOOL],
+#         ),
+#         prompt_template=types.EvaluationRunPromptTemplate(
+#             prompt_template=DEFAULT_PROMPT_TEMPLATE
+#         ),
+#     )
+#     # Check evaluation set
+#     assert evaluation_run.data_source.evaluation_set
+#     eval_set = client.evals.get_evaluation_set(
+#         name=evaluation_run.data_source.evaluation_set
+#     )
+#     assert len(eval_set.evaluation_items) == 2
+#     # Check evaluation items
+#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
+#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "prompt"
+#             ]
+#             == genai_types.Content(
+#                 parts=[
+#                     genai_types.Part(
+#                         text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
+#                     )
+#                 ],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "context"
+#             ]
+#             == genai_types.Content(
+#                 parts=[
+#                     genai_types.Part(
+#                         text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
+#                     )
+#                 ],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "conversation_history"
+#             ]
+#             == genai_types.Content(
+#                 parts=[
+#                     genai_types.Part(
+#                         text=(
+#                             INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
+#                                 "conversation_history"
+#                             ]
+#                         )
+#                     )
+#                 ],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].text
+#             == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
+#         )
+#     assert evaluation_run.error is None
+
 pytest_plugins = ("pytest_asyncio",)
 
 
@@ -371,14 +575,15 @@ async def test_create_eval_run_async_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
     client._api_client._http_options.api_version = "v1beta1"
     inference_config = types.EvaluationRunInferenceConfig(
-        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+        model=MODEL_NAME,
+        prompt_template=types.EvaluationRunPromptTemplate(
+            prompt_template="Test the {prompt}"
+        ),
     )
     evaluation_run = await client.aio.evals.create_evaluation_run(
         name="test_inference_config_async",
         display_name="test_inference_config_async",
-        dataset=types.EvaluationRunDataSource(
-            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-        ),
+        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
         dest=GCS_DEST,
         metrics=[GENERAL_QUALITY_METRIC],
         inference_configs={"model_1": inference_config},
@@ -388,9 +593,7 @@ async def test_create_eval_run_async_with_inference_configs(client):
     assert evaluation_run.display_name == "test_inference_config_async"
     assert evaluation_run.state == types.EvaluationRunState.PENDING
     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-    assert evaluation_run.data_source.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-    )
+    assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)