From b8d2c9958e5a0ce8c7efba6f6b7071aa650afffe Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Tue, 17 Feb 2026 13:58:22 -0800 Subject: [PATCH] feat: Add PromptTemplateData to support `context` and `history` columns when creating Evaluation run from dataframe PiperOrigin-RevId: 871483777 --- .../replays/test_create_evaluation_run.py | 471 +++++++++++++----- vertexai/_genai/_evals_common.py | 132 ++++- vertexai/_genai/_evals_constant.py | 2 + vertexai/_genai/evals.py | 124 ++++- vertexai/_genai/types/__init__.py | 6 + vertexai/_genai/types/common.py | 44 ++ 6 files changed, 622 insertions(+), 157 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index cd97ab042c..df125532c2 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -18,6 +18,7 @@ from vertexai import types from google.genai import types as genai_types import pytest +import pandas as pd GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output" GENERAL_QUALITY_METRIC = types.EvaluationRunMetric( @@ -63,29 +64,48 @@ ) ), ) +TOOL = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="get_weather", + description="Get weather in a location", + parameters={ + "type": "object", + "properties": {"location": {"type": "string"}}, + }, + ) + ] +) +AGENT_INFO = types.evals.AgentInfo( + agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456", + name="agent-1", + instruction="agent-1 instruction", + tool_declarations=[TOOL], +) +DEFAULT_PROMPT_TEMPLATE = "{prompt}" +INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame( + { + "prompt": ["prompt1", "prompt2"], + "reference": ["reference1", "reference2"], + "response": ["response1", "response2"], + "context": ["context1", "context2"], + "conversation_history": ["history1", "history2"], + } +) +CANDIDATE_NAME = 
"candidate_1" +MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" +EVAL_SET_NAME = ( + "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" +) def test_create_eval_run_data_source_evaluation_set(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" client._api_client._http_options.api_version = "v1beta1" - tool = genai_types.Tool( - function_declarations=[ - genai_types.FunctionDeclaration( - name="get_weather", - description="Get weather in a location", - parameters={ - "type": "object", - "properties": {"location": {"type": "string"}}, - }, - ) - ] - ) evaluation_run = client.evals.create_evaluation_run( name="test4", display_name="test4", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), dest=GCS_DEST, metrics=[ GENERAL_QUALITY_METRIC, @@ -94,21 +114,14 @@ def test_create_eval_run_data_source_evaluation_set(client): EXACT_MATCH_COMPUTATION_BASED_METRIC, BLEU_COMPUTATION_BASED_METRIC, ], - agent_info=types.evals.AgentInfo( - agent_resource_name="project/123/locations/us-central1/reasoningEngines/456", - name="agent-1", - instruction="agent-1 instruction", - tool_declarations=[tool], - ), + agent_info=AGENT_INFO, labels={"label1": "value1"}, ) assert isinstance(evaluation_run, types.EvaluationRun) assert evaluation_run.display_name == "test4" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( 
output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) @@ -122,13 +135,13 @@ def test_create_eval_run_data_source_evaluation_set(client): ], ) assert evaluation_run.inference_configs[ - "agent-1" + AGENT_INFO.name ] == types.EvaluationRunInferenceConfig( agent_config=types.EvaluationRunAgentConfig( developer_instruction=genai_types.Content( parts=[genai_types.Part(text="agent-1 instruction")] ), - tools=[tool], + tools=[TOOL], ) ) assert evaluation_run.labels == { @@ -190,14 +203,15 @@ def test_create_eval_run_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" client._api_client._http_options.api_version = "v1beta1" inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template="test prompt template" + ), ) evaluation_run = client.evals.create_evaluation_run( name="test_inference_config", display_name="test_inference_config", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], inference_configs={"model_1": inference_config}, @@ -207,9 +221,7 @@ def test_create_eval_run_with_inference_configs(client): assert evaluation_run.display_name == "test_inference_config" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert 
evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) @@ -223,102 +235,294 @@ def test_create_eval_run_with_inference_configs(client): assert evaluation_run.error is None -# Test fails in replay mode because of UUID generation mismatch. -# def test_create_eval_run_data_source_evaluation_dataset(client): -# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset.""" -# input_df = pd.DataFrame( -# { -# "prompt": ["prompt1", "prompt2"], -# "reference": ["reference1", "reference2"], -# "response": ["response1", "response2"], -# "intermediate_events": [ -# [ -# { -# "content": { -# "parts": [ -# {"text": "first user input"}, -# ], -# "role": "user", -# }, -# }, -# { -# "content": { -# "parts": [ -# {"text": "first model response"}, -# ], -# "role": "model", -# }, -# }, -# ], -# [ -# { -# "content": { -# "parts": [ -# {"text": "second user input"}, -# ], -# "role": "user", -# }, -# }, -# { -# "content": { -# "parts": [ -# {"text": "second model response"}, -# ], -# "role": "model", -# }, -# }, -# ], -# ], -# } -# ) -# evaluation_run = client.evals.create_evaluation_run( -# name="test6", -# display_name="test6", -# dataset=types.EvaluationDataset( -# candidate_name="candidate_1", -# eval_dataset_df=input_df, -# ), -# dest=GCS_DEST, -# metrics=[GENERAL_QUALITY_METRIC], -# ) -# assert isinstance(evaluation_run, types.EvaluationRun) -# assert evaluation_run.display_name == "test6" -# assert evaluation_run.state == types.EvaluationRunState.PENDING -# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) -# # Check evaluation set -# assert evaluation_run.data_source.evaluation_set -# eval_set = client.evals.get_evaluation_set( -# name=evaluation_run.data_source.evaluation_set -# ) -# assert len(eval_set.evaluation_items) == 2 -# # Check evaluation items -# for i, eval_item_name in 
enumerate(eval_set.evaluation_items): -# eval_item = client.evals.get_evaluation_item(name=eval_item_name) -# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST -# assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"] -# assert ( -# eval_item.evaluation_request.candidate_responses[0].text -# == input_df.iloc[i]["response"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text -# == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[0].role -# == input_df.iloc[i]["intermediate_events"][0]["content"]["role"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text -# == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"] -# ) -# assert ( -# eval_item.evaluation_request.candidate_responses[0].events[1].role -# == input_df.iloc[i]["intermediate_events"][1]["content"]["role"] -# ) -# assert evaluation_run.error is None +# Dataframe tests fail in replay mode because of UUID generation mismatch. +def test_create_eval_run_data_source_evaluation_dataset(client): + """Tests that create_evaluation_run() creates a correctly structured + EvaluationRun with EvaluationDataset. 
+ """ + input_df = pd.DataFrame( + { + "prompt": ["prompt1", "prompt2"], + "reference": ["reference1", "reference2"], + "response": ["response1", "response2"], + "intermediate_events": [ + [ + { + "content": { + "parts": [ + {"text": "first user input"}, + ], + "role": "user", + }, + }, + { + "content": { + "parts": [ + {"text": "first model response"}, + ], + "role": "model", + }, + }, + ], + [ + { + "content": { + "parts": [ + {"text": "second user input"}, + ], + "role": "user", + }, + }, + { + "content": { + "parts": [ + {"text": "second model response"}, + ], + "role": "model", + }, + }, + ], + ], + } + ) + evaluation_run = client.evals.create_evaluation_run( + name="test6", + display_name="test6", + dataset=types.EvaluationDataset( + candidate_name=CANDIDATE_NAME, + eval_dataset_df=input_df, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test6" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + # Check evaluation items + for i, eval_item_name in enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"] + assert ( + eval_item.evaluation_request.candidate_responses[0].text + == input_df.iloc[i]["response"] + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text + == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"] + ) + assert ( + 
eval_item.evaluation_request.candidate_responses[0].events[0].role + == input_df.iloc[i]["intermediate_events"][0]["content"]["role"] + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text + == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"] + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].events[1].role + == input_df.iloc[i]["intermediate_events"][1]["content"]["role"] + ) + assert evaluation_run.error is None +def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data( + client, +): + """Tests that create_evaluation_run() creates a correctly structured + EvaluationRun with EvaluationDataset and inference_configs. + Prompt template data is inferred from the dataset and a default prompt + template should be used. + """ + evaluation_run = client.evals.create_evaluation_run( + name="test9", + display_name="test9", + dataset=types.EvaluationDataset( + candidate_name=CANDIDATE_NAME, + eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + inference_configs={ + CANDIDATE_NAME: types.EvaluationRunInferenceConfig( + model=MODEL_NAME, + ) + }, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test9" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.inference_configs[ + CANDIDATE_NAME + ] == types.EvaluationRunInferenceConfig( + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template=DEFAULT_PROMPT_TEMPLATE + ), + ) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + # Check evaluation items + for i, eval_item_name in 
enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "prompt" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "context" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "conversation_history" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=( + INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][ + "conversation_history" + ] + ) + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].text + == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"] + ) + assert evaluation_run.error is None + + +def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data( + client, +): + """Tests that create_evaluation_run() creates a correctly structured + EvaluationRun with EvaluationDataset and agent_info. + Prompt template data is inferred from the dataset and a default prompt + template should be used. 
+ """ + evaluation_run = client.evals.create_evaluation_run( + name="test9", + display_name="test9", + dataset=types.EvaluationDataset( + candidate_name=CANDIDATE_NAME, + eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + agent_info=AGENT_INFO, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test9" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert evaluation_run.inference_configs[ + AGENT_INFO.name + ] == types.EvaluationRunInferenceConfig( + agent_config=types.EvaluationRunAgentConfig( + developer_instruction=genai_types.Content( + parts=[genai_types.Part(text=AGENT_INFO.instruction)] + ), + tools=[TOOL], + ), + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template=DEFAULT_PROMPT_TEMPLATE + ), + ) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + # Check evaluation items + for i, eval_item_name in enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "prompt" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "context" + ] + == genai_types.Content( + parts=[ + genai_types.Part( + text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"] + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.prompt.prompt_template_data.values[ + "conversation_history" + ] + == 
genai_types.Content( + parts=[ + genai_types.Part( + text=( + INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][ + "conversation_history" + ] + ) + ) + ], + role="user", + ) + ) + assert ( + eval_item.evaluation_request.candidate_responses[0].text + == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"] + ) + assert evaluation_run.error is None + pytest_plugins = ("pytest_asyncio",) @@ -371,14 +575,15 @@ async def test_create_eval_run_async_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously.""" client._api_client._http_options.api_version = "v1beta1" inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" + model=MODEL_NAME, + prompt_template=types.EvaluationRunPromptTemplate( + prompt_template="Test the {prompt}" + ), ) evaluation_run = await client.aio.evals.create_evaluation_run( name="test_inference_config_async", display_name="test_inference_config_async", - dataset=types.EvaluationRunDataSource( - evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ), + dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], inference_configs={"model_1": inference_config}, @@ -388,9 +593,7 @@ async def test_create_eval_run_async_with_inference_configs(client): assert evaluation_run.display_name == "test_inference_config_async" assert evaluation_run.state == types.EvaluationRunState.PENDING assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) - assert evaluation_run.data_source.evaluation_set == ( - "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800" - ) + assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME assert evaluation_run.evaluation_config == types.EvaluationRunConfig( output_config=genai_types.OutputConfig( 
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 0bc28994ed..3a6b11ddf2 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -277,13 +277,51 @@ def _resolve_dataset( return dataset +def _get_default_prompt_template( + api_client: BaseApiClient, + inference_config: types.EvaluationRunInferenceConfigOrDict, + dataset: types.EvaluationRunDataSource, +) -> Any: + """Resolves prompt template data for the evaluation run.""" + if isinstance(inference_config, dict): + if inference_config.get("prompt_template"): + return inference_config["prompt_template"] + elif inference_config.prompt_template: + return inference_config.prompt_template + + try: + evals_module = evals.Evals(api_client_=api_client) + eval_set = evals_module.get_evaluation_set(name=dataset.evaluation_set) + if eval_set and eval_set.evaluation_items: + eval_item = evals_module.get_evaluation_item( + name=eval_set.evaluation_items[0] + ) + if ( + eval_item + and eval_item.evaluation_request + and eval_item.evaluation_request.prompt + and eval_item.evaluation_request.prompt.prompt_template_data + ): + if ( + "prompt" + in eval_item.evaluation_request.prompt.prompt_template_data.values + ): + return "{prompt}" + except Exception as e: + logger.warning("Failed to get prompt template from evaluation set: %s", e) + return None + + def _resolve_inference_configs( + api_client: BaseApiClient, + dataset: types.EvaluationRunDataSource, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, agent_info_pydantic: Optional[types.evals.AgentInfo] = None, ) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]: """Resolves inference configs for the evaluation run.""" + # Resolve agent config if agent_info_pydantic and agent_info_pydantic.name: inference_configs = {} inference_configs[agent_info_pydantic.name] = ( @@ -296,13 +334,41 @@ 
def _resolve_inference_configs( ) ) ) + # Resolve prompt template data + if inference_configs: + for inference_config in inference_configs.values(): + prompt_template_val = ( + inference_config.get("prompt_template") + if isinstance(inference_config, dict) + else inference_config.prompt_template + ) + if not prompt_template_val: + default_prompt_template = _get_default_prompt_template( + api_client, inference_config, dataset + ) + if default_prompt_template: + prompt_template_to_set = default_prompt_template + if not isinstance( + default_prompt_template, types.EvaluationRunPromptTemplate + ): + prompt_template_to_set = types.EvaluationRunPromptTemplate( + prompt_template=default_prompt_template + ) + if isinstance(inference_config, dict): + inference_config[ + "prompt_template" + ] = prompt_template_to_set.model_dump(exclude_none=True) + else: + # NOTE(review): assign the typed object, not its dict dump, + # so the pydantic field stays type-consistent. + inference_config.prompt_template = prompt_template_to_set return inference_configs def _add_evaluation_run_labels( labels: Optional[dict[str, str]] = None, agent_info_pydantic: Optional[types.evals.AgentInfo] = None, -) -> Optional[dict[str, str]]: +) -> Optional[dict[str, str]]: """Adds labels to the evaluation run.""" if agent_info_pydantic and agent_info_pydantic.agent_resource_name: labels = labels or {} @@ -324,8 +390,7 @@ def _get_candidate_name( and dataset.candidate_name != agent_info_pydantic.name ): logger.warning( - "Evaluation dataset candidate_name and agent_info.name are different." - " Please make sure this is intended." + "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
) elif dataset.candidate_name is None and agent_info_pydantic: return agent_info_pydantic.name @@ -1944,6 +2009,21 @@ def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]: return result +def _get_content(row: dict[str, Any], column: str): + if isinstance(row[column], str): + return genai_types.Content( + parts=[genai_types.Part(text=row[column])], + role=_evals_constant.USER_AUTHOR, + ) + elif isinstance(row[column], genai_types.Content): + return row[column] + else: + raise ValueError( + f"{column} must be a string or a Content object. " + f"Got {type(row[column])}." + ) + + def _create_evaluation_set_from_dataframe( api_client: BaseApiClient, gcs_dest_prefix: str, @@ -1962,29 +2042,43 @@ def _create_evaluation_set_from_dataframe( for event in row[_evals_constant.INTERMEDIATE_EVENTS]: if CONTENT in event: intermediate_events.append(event[CONTENT]) + if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row: + values = {} + if _evals_constant.CONTEXT in row: + values[_evals_constant.CONTEXT] = _get_content( + row, _evals_constant.CONTEXT + ) + if _evals_constant.HISTORY in row: + values[_evals_constant.HISTORY] = _get_content( + row, _evals_constant.HISTORY + ) + if _evals_constant.PROMPT in row: + values[_evals_constant.PROMPT] = _get_content( + row, _evals_constant.PROMPT + ) + prompt = types.EvaluationPrompt( + prompt_template_data=types.PromptTemplateData(values=values) + ) + else: + prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) if _evals_constant.PROMPT in row else None + candidate_responses = [] + if _evals_constant.RESPONSE in row: + candidate_responses.append( + types.CandidateResponse( + candidate=candidate_name or "Candidate 1", + text=row[_evals_constant.RESPONSE], + events=intermediate_events or None, + ) + ) eval_item_requests.append( types.EvaluationItemRequest( - prompt=( - types.EvaluationPrompt(text=row[_evals_constant.PROMPT]) - if _evals_constant.PROMPT in row - else None - ), + prompt=prompt or None, golden_response=(
types.CandidateResponse(text=row[_evals_constant.REFERENCE]) if _evals_constant.REFERENCE in row else None ), - candidate_responses=[ - types.CandidateResponse( - candidate=candidate_name or "Candidate 1", - text=row.get(_evals_constant.RESPONSE, None), - events=( - intermediate_events - if len(intermediate_events) > 0 - else None - ), - ) - ], + candidate_responses=candidate_responses or None, ) ) logger.info("Writing evaluation item requests to GCS.") diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py index 6fc27d94e0..847140dc5c 100644 --- a/vertexai/_genai/_evals_constant.py +++ b/vertexai/_genai/_evals_constant.py @@ -53,6 +53,7 @@ CONTENT = "content" PARTS = "parts" USER_AUTHOR = "user" +HISTORY = "conversation_history" COMMON_DATASET_COLUMNS = frozenset( { @@ -61,5 +62,6 @@ REFERENCE, SESSION_INPUT, CONTEXT, + HISTORY, } ) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index bbf5a56c56..bf9f932697 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -94,7 +94,12 @@ def _CreateEvaluationRunParameters_to_vertex( setv(to_object, ["labels"], getv(from_object, ["labels"])) if getv(from_object, ["inference_configs"]) is not None: - setv(to_object, ["inferenceConfigs"], getv(from_object, ["inference_configs"])) + inference_configs = getv(from_object, ["inference_configs"]) + vertex_inference_configs = { + k: _EvaluationRunInferenceConfig_to_vertex(v) + for k, v in inference_configs.items() + } + setv(to_object, ["inferenceConfigs"], vertex_inference_configs) if getv(from_object, ["config"]) is not None: setv(to_object, ["config"], getv(from_object, ["config"])) @@ -254,6 +259,9 @@ def _EvaluationRunConfig_from_vertex( if getv(from_object, ["autoraterConfig"]) is not None: setv(to_object, ["autorater_config"], getv(from_object, ["autoraterConfig"])) + if getv(from_object, ["promptTemplate"]) is not None: + setv(to_object, ["prompt_template"], getv(from_object, ["promptTemplate"])) + return 
to_object @@ -278,6 +286,15 @@ def _EvaluationRunConfig_to_vertex( if getv(from_object, ["autorater_config"]) is not None: setv(to_object, ["autoraterConfig"], getv(from_object, ["autorater_config"])) + if getv(from_object, ["prompt_template"]) is not None: + setv( + to_object, + ["promptTemplate"], + _EvaluationRunPromptTemplate_to_vertex( + getv(from_object, ["prompt_template"]) + ), + ) + return to_object @@ -370,7 +387,12 @@ def _EvaluationRun_from_vertex( ) if getv(from_object, ["inferenceConfigs"]) is not None: - setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"])) + inference_configs = getv(from_object, ["inferenceConfigs"]) + vertex_inference_configs = { + k: _EvaluationRunInferenceConfig_from_vertex(v) + for k, v in inference_configs.items() + } + setv(to_object, ["inference_configs"], vertex_inference_configs) if getv(from_object, ["labels"]) is not None: setv(to_object, ["labels"], getv(from_object, ["labels"])) @@ -538,6 +560,99 @@ def _RubricBasedMetricSpec_to_vertex( return to_object +def _EvaluationRunPromptTemplate_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["prompt_template"]) is not None: + setv(to_object, ["promptTemplate"], getv(from_object, ["prompt_template"])) + if getv(from_object, ["gcs_uri"]) is not None: + setv(to_object, ["gcsUri"], getv(from_object, ["gcs_uri"])) + return to_object + + +def _EvaluationRunPromptTemplate_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["promptTemplate"]) is not None: + setv(to_object, ["prompt_template"], getv(from_object, ["promptTemplate"])) + if getv(from_object, ["gcsUri"]) is not None: + setv(to_object, ["gcs_uri"], getv(from_object, ["gcsUri"])) + return to_object + + +def 
_EvaluationRunAgentConfig_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["developer_instruction"]) is not None: + setv(to_object, ["developerInstruction"], getv(from_object, ["developer_instruction"])) + if getv(from_object, ["tools"]) is not None: + setv(to_object, ["tools"], getv(from_object, ["tools"])) + return to_object + + +def _EvaluationRunAgentConfig_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["developerInstruction"]) is not None: + setv(to_object, ["developer_instruction"], getv(from_object, ["developerInstruction"])) + if getv(from_object, ["tools"]) is not None: + setv(to_object, ["tools"], getv(from_object, ["tools"])) + return to_object + + +def _EvaluationRunInferenceConfig_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["model"]) is not None: + setv(to_object, ["model"], getv(from_object, ["model"])) + if getv(from_object, ["prompt_template"]) is not None: + setv( + to_object, + ["promptTemplate"], + _EvaluationRunPromptTemplate_to_vertex(getv(from_object, ["prompt_template"])), + ) + if getv(from_object, ["agent_config"]) is not None: + setv(to_object, ["agentConfig"], _EvaluationRunAgentConfig_to_vertex(getv(from_object, ["agent_config"]), to_object)) + # NOTE(review): a duplicate "model" mapping was removed here; "model" + # is already set at the top of this function. + if getv(from_object, ["user_simulator_config"]) is not None: + setv(to_object, ["userSimulatorConfig"], getv(from_object, ["user_simulator_config"])) + return to_object + + +def _EvaluationRunInferenceConfig_from_vertex( + from_object: Union[dict[str, Any], object], +
parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + """Converts a Vertex InferenceConfig response to SDK format.""" + to_object: dict[str, Any] = {} + if getv(from_object, ["model"]) is not None: + setv(to_object, ["model"], getv(from_object, ["model"])) + if getv(from_object, ["promptTemplate"]) is not None: + setv( + to_object, + ["prompt_template"], + _EvaluationRunPromptTemplate_from_vertex(getv(from_object, ["promptTemplate"])), + ) + if getv(from_object, ["agentConfig"]) is not None: + setv(to_object, ["agent_config"], _EvaluationRunAgentConfig_from_vertex(getv(from_object, ["agentConfig"]), to_object)) + # NOTE(review): a duplicate "model" mapping was removed here; "model" + # is already set at the top of this function. + if getv(from_object, ["userSimulatorConfig"]) is not None: + setv(to_object, ["user_simulator_config"], getv(from_object, ["userSimulatorConfig"])) + return to_object + + def _RubricGenerationSpec_to_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -778,6 +893,7 @@ def _create_evaluation_run( request_dict = _common.convert_to_dict(request_dict) request_dict = _common.encode_unserializable_types(request_dict) + logger.debug("request_dict: %s", request_dict) response = self._api_client.request("post", path, request_dict, http_options) @@ -1732,7 +1848,7 @@ def create_evaluation_run( output_config=output_config, metrics=resolved_metrics ) resolved_inference_configs = _evals_common._resolve_inference_configs( - inference_configs, agent_info_pydantic + self._api_client, resolved_dataset, inference_configs, agent_info_pydantic ) resolved_labels = _evals_common._add_evaluation_run_labels( labels, agent_info_pydantic @@ -2660,7 +2776,7 @@ async def create_evaluation_run( output_config=output_config, metrics=resolved_metrics ) resolved_inference_configs = _evals_common._resolve_inference_configs( - inference_configs, agent_info_pydantic + self._api_client, resolved_dataset,
inference_configs, agent_info_pydantic ) resolved_labels = _evals_common._add_evaluation_run_labels( labels, agent_info_pydantic diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index d3a933f84d..4049382811 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -387,6 +387,9 @@ from .common import EvaluationRunMetricDict from .common import EvaluationRunMetricOrDict from .common import EvaluationRunOrDict +from .common import EvaluationRunPromptTemplate +from .common import EvaluationRunPromptTemplateDict +from .common import EvaluationRunPromptTemplateOrDict from .common import EvaluationRunResults from .common import EvaluationRunResultsDict from .common import EvaluationRunResultsOrDict @@ -1298,6 +1301,9 @@ "EvaluationRunMetric", "EvaluationRunMetricDict", "EvaluationRunMetricOrDict", + "EvaluationRunPromptTemplate", + "EvaluationRunPromptTemplateDict", + "EvaluationRunPromptTemplateOrDict", "EvaluationRunConfig", "EvaluationRunConfigDict", "EvaluationRunConfigOrDict", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index a4ab38e698..2d9fa4c22a 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -2287,6 +2287,38 @@ class EvaluationRunMetricDict(TypedDict, total=False): EvaluationRunMetricOrDict = Union[EvaluationRunMetric, EvaluationRunMetricDict] +class EvaluationRunPromptTemplate(_common.BaseModel): + """Prompt template used for inference.""" + + prompt_template: Optional[str] = Field( + default=None, + description="""Inline prompt template. Template variables should be in the format + "{var_name}".""", + ) + gcs_uri: Optional[str] = Field( + default=None, + description="""Prompt template stored in Cloud Storage. 
Format: + "gs://my-bucket/file-name.txt".""", + ) + + +class EvaluationRunPromptTemplateDict(TypedDict, total=False): + """Prompt template used for inference.""" + + prompt_template: Optional[str] + """Inline prompt template. Template variables should be in the format + "{var_name}".""" + + gcs_uri: Optional[str] + """Prompt template stored in Cloud Storage. Format: + "gs://my-bucket/file-name.txt".""" + + +EvaluationRunPromptTemplateOrDict = Union[ + EvaluationRunPromptTemplate, EvaluationRunPromptTemplateDict +] + + class EvaluationRunConfig(_common.BaseModel): """The evaluation configuration used for the evaluation run.""" @@ -2300,6 +2332,9 @@ class EvaluationRunConfig(_common.BaseModel): autorater_config: Optional[genai_types.AutoraterConfig] = Field( default=None, description="""The autorater config for the evaluation run.""" ) + prompt_template: Optional[EvaluationRunPromptTemplate] = Field( + default=None, description="""The prompt template used for inference.""" + ) class EvaluationRunConfigDict(TypedDict, total=False): @@ -2314,6 +2349,9 @@ class EvaluationRunConfigDict(TypedDict, total=False): autorater_config: Optional[genai_types.AutoraterConfigDict] """The autorater config for the evaluation run.""" + prompt_template: Optional[EvaluationRunPromptTemplateDict] + """The prompt template used for inference.""" + EvaluationRunConfigOrDict = Union[EvaluationRunConfig, EvaluationRunConfigDict] @@ -3039,6 +3077,9 @@ class EvaluationRunInferenceConfig(_common.BaseModel): default=None, description="""The fully qualified name of the publisher model or endpoint to use for inference.""", ) + prompt_template: Optional[EvaluationRunPromptTemplate] = Field( + default=None, description="""The prompt template used for inference.""" + ) user_simulator_config: Optional[evals_types.UserSimulatorConfig] = Field( default=None, description="""Used for multi-turn agent run. 
@@ -3059,6 +3100,9 @@ class EvaluationRunInferenceConfigDict(TypedDict, total=False): model: Optional[str] """The fully qualified name of the publisher model or endpoint to use for inference.""" + prompt_template: Optional[EvaluationRunPromptTemplateDict] + """The prompt template used for inference.""" + user_simulator_config: Optional[evals_types.UserSimulatorConfig] """Used for multi-turn agent run. Contains configuration for a user simulator that