googleapis · copybara-service · Feb 17, 2026
@@ -18,6 +18,7 @@
 from vertexai import types
 from google.genai import types as genai_types
 import pytest
+import pandas as pd
 
 GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
 GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
@@ -63,29 +64,48 @@
         )
     ),
 )
+TOOL = genai_types.Tool(  # Shared tool fixture: declares a single function, "get_weather".
+    function_declarations=[
+        genai_types.FunctionDeclaration(
+            name="get_weather",
+            description="Get weather in a location",
+            parameters={
+                "type": "object",
+                "properties": {"location": {"type": "string"}},
+            },
+        )
+    ]
+)
+AGENT_INFO = types.evals.AgentInfo(  # Shared agent fixture; the tests look up inference_configs by its name.
+    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
+    name="agent-1",
+    instruction="agent-1 instruction",
+    tool_declarations=[TOOL],
+)
+DEFAULT_PROMPT_TEMPLATE = "{prompt}"  # Template the tests expect on the run when the caller supplies none.
+INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(  # Two-row dataset; row i is compared with evaluation item i.
+    {
+        "prompt": ["prompt1", "prompt2"],
+        "reference": ["reference1", "reference2"],
+        "response": ["response1", "response2"],
+        "context": ["context1", "context2"],
+        "conversation_history": ["history1", "history2"],
+    }
+)
+CANDIDATE_NAME = "candidate_1"  # Used as the dataset's candidate_name and as an inference_configs key.
+MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"  # Passed as EvaluationRunInferenceConfig(model=...).
+EVAL_SET_NAME = (  # Evaluation set resource name passed as EvaluationRunDataSource(evaluation_set=...).
+    "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+)
 
 
 def test_create_eval_run_data_source_evaluation_set(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
     client._api_client._http_options.api_version = "v1beta1"
-    tool = genai_types.Tool(
-        function_declarations=[
-            genai_types.FunctionDeclaration(
-                name="get_weather",
-                description="Get weather in a location",
-                parameters={
-                    "type": "object",
-                    "properties": {"location": {"type": "string"}},
-                },
-            )
-        ]
-    )
     evaluation_run = client.evals.create_evaluation_run(
         name="test4",
         display_name="test4",
-        dataset=types.EvaluationRunDataSource(
-            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-        ),
+        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
         dest=GCS_DEST,
         metrics=[
             GENERAL_QUALITY_METRIC,
@@ -94,21 +114,14 @@ def test_create_eval_run_data_source_evaluation_set(client):
             EXACT_MATCH_COMPUTATION_BASED_METRIC,
             BLEU_COMPUTATION_BASED_METRIC,
         ],
-        agent_info=types.evals.AgentInfo(
-            agent_resource_name="project/123/locations/us-central1/reasoningEngines/456",
-            name="agent-1",
-            instruction="agent-1 instruction",
-            tool_declarations=[tool],
-        ),
+        agent_info=AGENT_INFO,
         labels={"label1": "value1"},
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test4"
     assert evaluation_run.state == types.EvaluationRunState.PENDING
     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-    assert evaluation_run.data_source.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-    )
+    assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -122,13 +135,13 @@ def test_create_eval_run_data_source_evaluation_set(client):
         ],
     )
     assert evaluation_run.inference_configs[
-        "agent-1"
+        AGENT_INFO.name
     ] == types.EvaluationRunInferenceConfig(
         agent_config=types.EvaluationRunAgentConfig(
             developer_instruction=genai_types.Content(
                 parts=[genai_types.Part(text="agent-1 instruction")]
             ),
-            tools=[tool],
+            tools=[TOOL],
         )
     )
     assert evaluation_run.labels == {
@@ -190,14 +203,15 @@ def test_create_eval_run_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
     client._api_client._http_options.api_version = "v1beta1"
     inference_config = types.EvaluationRunInferenceConfig(
-        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+        model=MODEL_NAME,
+        prompt_template=types.EvaluationRunPromptTemplate(
+            prompt_template="test prompt template"
+        ),
     )
     evaluation_run = client.evals.create_evaluation_run(
         name="test_inference_config",
         display_name="test_inference_config",
-        dataset=types.EvaluationRunDataSource(
-            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-        ),
+        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
         dest=GCS_DEST,
         metrics=[GENERAL_QUALITY_METRIC],
         inference_configs={"model_1": inference_config},
@@ -207,9 +221,7 @@ def test_create_eval_run_with_inference_configs(client):
     assert evaluation_run.display_name == "test_inference_config"
     assert evaluation_run.state == types.EvaluationRunState.PENDING
     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-    assert evaluation_run.data_source.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-    )
+    assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -223,9 +235,11 @@ def test_create_eval_run_with_inference_configs(client):
     assert evaluation_run.error is None
 
 
-# Test fails in replay mode because of UUID generation mismatch.
+# The commented-out DataFrame tests below fail in replay mode because of a UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
+#     """Tests that create_evaluation_run() creates a correctly structured
+#     EvaluationRun with EvaluationDataset.
+#     """
 #     input_df = pd.DataFrame(
 #         {
 #             "prompt": ["prompt1", "prompt2"],
@@ -275,7 +289,7 @@ def test_create_eval_run_with_inference_configs(client):
 #         name="test6",
 #         display_name="test6",
 #         dataset=types.EvaluationDataset(
-#             candidate_name="candidate_1",
+#             candidate_name=CANDIDATE_NAME,
 #             eval_dataset_df=input_df,
 #         ),
 #         dest=GCS_DEST,
@@ -319,6 +333,196 @@ def test_create_eval_run_with_inference_configs(client):
 #     assert evaluation_run.error is None
 
 
+def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data(
+    client,
+):
+    """Tests create_evaluation_run() with an EvaluationDataset and inference_configs.
+    The config passes only a model; the returned run must carry DEFAULT_PROMPT_TEMPLATE,
+    and each evaluation item must hold the DataFrame's prompt, context and
+    conversation_history as prompt template data, plus response as candidate response.
+    """
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test9",  # NOTE(review): the agent_info variant of this test also uses "test9" - confirm intended.
+        display_name="test9",
+        dataset=types.EvaluationDataset(
+            candidate_name=CANDIDATE_NAME,
+            eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        inference_configs={
+            CANDIDATE_NAME: types.EvaluationRunInferenceConfig(
+                model=MODEL_NAME,
+            )
+        },
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test9"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.inference_configs[
+        CANDIDATE_NAME
+    ] == types.EvaluationRunInferenceConfig(
+        model=MODEL_NAME,
+        prompt_template=types.EvaluationRunPromptTemplate(
+            prompt_template=DEFAULT_PROMPT_TEMPLATE
+        ),
+    )
+    # The run must reference an evaluation set holding one item per DataFrame row (2 rows).
+    assert evaluation_run.data_source.evaluation_set
+    eval_set = client.evals.get_evaluation_set(
+        name=evaluation_run.data_source.evaluation_set
+    )
+    assert len(eval_set.evaluation_items) == 2
+    # Item i is compared with DataFrame row i (assumes items come back in row order - TODO confirm).
+    for i, eval_item_name in enumerate(eval_set.evaluation_items):
+        eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+        assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+        assert (
+            eval_item.evaluation_request.prompt.prompt_template_data.values[
+                "prompt"
+            ]
+            == genai_types.Content(
+                parts=[
+                    genai_types.Part(
+                        text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
+                    )
+                ],
+                role="user",
+            )
+        )
+        assert (
+            eval_item.evaluation_request.prompt.prompt_template_data.values[
+                "context"
+            ]
+            == genai_types.Content(
+                parts=[
+                    genai_types.Part(
+                        text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
+                    )
+                ],
+                role="user",
+            )
+        )
+        assert (
+            eval_item.evaluation_request.prompt.prompt_template_data.values[
+                "conversation_history"
+            ]
+            == genai_types.Content(
+                parts=[
+                    genai_types.Part(
+                        text=(
+                            INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
+                                "conversation_history"
+                            ]
+                        )
+                    )
+                ],
+                role="user",
+            )
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].text
+            == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
+        )
+    assert evaluation_run.error is None
+
+
+def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data(
+    client,
+):
+    """Tests create_evaluation_run() with an EvaluationDataset and agent_info.
+    The run's inference config for AGENT_INFO.name must hold the agent's instruction and
+    tools together with DEFAULT_PROMPT_TEMPLATE, and each evaluation item must hold the
+    DataFrame's prompt, context and conversation_history as prompt template data.
+    """
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test9",  # NOTE(review): the inference_configs variant of this test also uses "test9" - confirm intended.
+        display_name="test9",
+        dataset=types.EvaluationDataset(
+            candidate_name=CANDIDATE_NAME,
+            eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        agent_info=AGENT_INFO,
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test9"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.inference_configs[
+        AGENT_INFO.name
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text=AGENT_INFO.instruction)]
+            ),
+            tools=[TOOL],
+        ),
+        prompt_template=types.EvaluationRunPromptTemplate(
+            prompt_template=DEFAULT_PROMPT_TEMPLATE
+        ),
+    )
+    # The run must reference an evaluation set holding one item per DataFrame row (2 rows).
+    assert evaluation_run.data_source.evaluation_set
+    eval_set = client.evals.get_evaluation_set(
+        name=evaluation_run.data_source.evaluation_set
+    )
+    assert len(eval_set.evaluation_items) == 2
+    # Item i is compared with DataFrame row i (assumes items come back in row order - TODO confirm).
+    for i, eval_item_name in enumerate(eval_set.evaluation_items):
+        eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+        assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+        assert (
+            eval_item.evaluation_request.prompt.prompt_template_data.values[
+                "prompt"
+            ]
+            == genai_types.Content(
+                parts=[
+                    genai_types.Part(
+                        text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
+                    )
+                ],
+                role="user",
+            )
+        )
+        assert (
+            eval_item.evaluation_request.prompt.prompt_template_data.values[
+                "context"
+            ]
+            == genai_types.Content(
+                parts=[
+                    genai_types.Part(
+                        text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
+                    )
+                ],
+                role="user",
+            )
+        )
+        assert (
+            eval_item.evaluation_request.prompt.prompt_template_data.values[
+                "conversation_history"
+            ]
+            == genai_types.Content(
+                parts=[
+                    genai_types.Part(
+                        text=(
+                            INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
+                                "conversation_history"
+                            ]
+                        )
+                    )
+                ],
+                role="user",
+            )
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].text
+            == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
+        )
+    assert evaluation_run.error is None
+
 pytest_plugins = ("pytest_asyncio",)
 
 
@@ -371,14 +575,15 @@ async def test_create_eval_run_async_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
     client._api_client._http_options.api_version = "v1beta1"
     inference_config = types.EvaluationRunInferenceConfig(
-        model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+        model=MODEL_NAME,
+        prompt_template=types.EvaluationRunPromptTemplate(
+            prompt_template="Test the {prompt}"
+        ),
     )
     evaluation_run = await client.aio.evals.create_evaluation_run(
         name="test_inference_config_async",
         display_name="test_inference_config_async",
-        dataset=types.EvaluationRunDataSource(
-            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-        ),
+        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
         dest=GCS_DEST,
         metrics=[GENERAL_QUALITY_METRIC],
         inference_configs={"model_1": inference_config},
@@ -388,9 +593,7 @@ async def test_create_eval_run_async_with_inference_configs(client):
     assert evaluation_run.display_name == "test_inference_config_async"
     assert evaluation_run.state == types.EvaluationRunState.PENDING
     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-    assert evaluation_run.data_source.evaluation_set == (
-        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-    )
+    assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)