Skip to content

Commit 4e6c53c

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add PromptTemplateData to support context and history columns when creating Evaluation run from dataframe
PiperOrigin-RevId: 871483777
1 parent 5705565 commit 4e6c53c

File tree

6 files changed

+406
-47
lines changed

6 files changed

+406
-47
lines changed

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 244 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
from vertexai import types
1919
from google.genai import types as genai_types
2020
import pytest
21+
import pandas as pd
2122

2223
GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
2324
GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
@@ -63,28 +64,46 @@
6364
)
6465
),
6566
)
66-
67+
TOOL = genai_types.Tool(
68+
function_declarations=[
69+
genai_types.FunctionDeclaration(
70+
name="get_weather",
71+
description="Get weather in a location",
72+
parameters={
73+
"type": "object",
74+
"properties": {"location": {"type": "string"}},
75+
},
76+
)
77+
]
78+
)
79+
AGENT_INFO = types.evals.AgentInfo(
80+
agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
81+
name="agent-1",
82+
instruction="agent-1 instruction",
83+
tool_declarations=[TOOL],
84+
)
85+
DEFAULT_PROMPT_TEMPLATE = "{prompt}"
86+
INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
87+
{
88+
"prompt": ["prompt1", "prompt2"],
89+
"reference": ["reference1", "reference2"],
90+
"response": ["response1", "response2"],
91+
"context": ["context1", "context2"],
92+
"conversation_history": ["history1", "history2"],
93+
}
94+
)
95+
CANDIDATE_NAME = "candidate_1"
96+
MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
97+
EVAL_SET_NAME = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
6798

6899
def test_create_eval_run_data_source_evaluation_set(client):
69100
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
70101
client._api_client._http_options.api_version = "v1beta1"
71-
tool = genai_types.Tool(
72-
function_declarations=[
73-
genai_types.FunctionDeclaration(
74-
name="get_weather",
75-
description="Get weather in a location",
76-
parameters={
77-
"type": "object",
78-
"properties": {"location": {"type": "string"}},
79-
},
80-
)
81-
]
82-
)
83102
evaluation_run = client.evals.create_evaluation_run(
84103
name="test4",
85104
display_name="test4",
86105
dataset=types.EvaluationRunDataSource(
87-
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
106+
evaluation_set=EVAL_SET_NAME
88107
),
89108
dest=GCS_DEST,
90109
metrics=[
@@ -94,21 +113,14 @@ def test_create_eval_run_data_source_evaluation_set(client):
94113
EXACT_MATCH_COMPUTATION_BASED_METRIC,
95114
BLEU_COMPUTATION_BASED_METRIC,
96115
],
97-
agent_info=types.evals.AgentInfo(
98-
agent_resource_name="project/123/locations/us-central1/reasoningEngines/456",
99-
name="agent-1",
100-
instruction="agent-1 instruction",
101-
tool_declarations=[tool],
102-
),
116+
agent_info=AGENT_INFO,
103117
labels={"label1": "value1"},
104118
)
105119
assert isinstance(evaluation_run, types.EvaluationRun)
106120
assert evaluation_run.display_name == "test4"
107121
assert evaluation_run.state == types.EvaluationRunState.PENDING
108122
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
109-
assert evaluation_run.data_source.evaluation_set == (
110-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
111-
)
123+
assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
112124
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
113125
output_config=genai_types.OutputConfig(
114126
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -122,13 +134,13 @@ def test_create_eval_run_data_source_evaluation_set(client):
122134
],
123135
)
124136
assert evaluation_run.inference_configs[
125-
"agent-1"
137+
AGENT_INFO.name
126138
] == types.EvaluationRunInferenceConfig(
127139
agent_config=types.EvaluationRunAgentConfig(
128140
developer_instruction=genai_types.Content(
129141
parts=[genai_types.Part(text="agent-1 instruction")]
130142
),
131-
tools=[tool],
143+
tools=[TOOL],
132144
)
133145
)
134146
assert evaluation_run.labels == {
@@ -190,13 +202,16 @@ def test_create_eval_run_with_inference_configs(client):
190202
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
191203
client._api_client._http_options.api_version = "v1beta1"
192204
inference_config = types.EvaluationRunInferenceConfig(
193-
model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
205+
model=MODEL_NAME,
206+
prompt_template=types.EvaluationRunPromptTemplate(
207+
prompt_template="test prompt template"
208+
),
194209
)
195210
evaluation_run = client.evals.create_evaluation_run(
196211
name="test_inference_config",
197212
display_name="test_inference_config",
198213
dataset=types.EvaluationRunDataSource(
199-
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
214+
evaluation_set=EVAL_SET_NAME
200215
),
201216
dest=GCS_DEST,
202217
metrics=[GENERAL_QUALITY_METRIC],
@@ -207,9 +222,7 @@ def test_create_eval_run_with_inference_configs(client):
207222
assert evaluation_run.display_name == "test_inference_config"
208223
assert evaluation_run.state == types.EvaluationRunState.PENDING
209224
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
210-
assert evaluation_run.data_source.evaluation_set == (
211-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
212-
)
225+
assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
213226
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
214227
output_config=genai_types.OutputConfig(
215228
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -223,9 +236,11 @@ def test_create_eval_run_with_inference_configs(client):
223236
assert evaluation_run.error is None
224237

225238

226-
# Test fails in replay mode because of UUID generation mismatch.
239+
# Dataframe tests fail in replay mode because of UUID generation mismatch.
227240
# def test_create_eval_run_data_source_evaluation_dataset(client):
228-
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
241+
# """Tests that create_evaluation_run() creates a correctly structured
242+
# EvaluationRun with EvaluationDataset.
243+
# """
229244
# input_df = pd.DataFrame(
230245
# {
231246
# "prompt": ["prompt1", "prompt2"],
@@ -275,7 +290,7 @@ def test_create_eval_run_with_inference_configs(client):
275290
# name="test6",
276291
# display_name="test6",
277292
# dataset=types.EvaluationDataset(
278-
# candidate_name="candidate_1",
293+
# candidate_name=CANDIDATE_NAME,
279294
# eval_dataset_df=input_df,
280295
# ),
281296
# dest=GCS_DEST,
@@ -319,6 +334,196 @@ def test_create_eval_run_with_inference_configs(client):
319334
# assert evaluation_run.error is None
320335

321336

337+
# def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data(
338+
# client,
339+
# ):
340+
# """Tests that create_evaluation_run() creates a correctly structured
341+
# EvaluationRun with EvaluationDataset and inference_configs.
342+
# Prompt template data is inferred from the dataset and a default prompt
343+
# template should be used.
344+
# """
345+
# evaluation_run = client.evals.create_evaluation_run(
346+
# name="test9",
347+
# display_name="test9",
348+
# dataset=types.EvaluationDataset(
349+
# candidate_name=CANDIDATE_NAME,
350+
# eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
351+
# ),
352+
# dest=GCS_DEST,
353+
# metrics=[GENERAL_QUALITY_METRIC],
354+
# inference_configs={
355+
# CANDIDATE_NAME: types.EvaluationRunInferenceConfig(
356+
# model=MODEL_NAME,
357+
# )
358+
# },
359+
# )
360+
# assert isinstance(evaluation_run, types.EvaluationRun)
361+
# assert evaluation_run.display_name == "test9"
362+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
363+
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
364+
# assert evaluation_run.inference_configs[
365+
# CANDIDATE_NAME
366+
# ] == types.EvaluationRunInferenceConfig(
367+
# model=MODEL_NAME,
368+
# prompt_template=types.EvaluationRunPromptTemplate(
369+
# prompt_template=DEFAULT_PROMPT_TEMPLATE
370+
# ),
371+
# )
372+
# # Check evaluation set
373+
# assert evaluation_run.data_source.evaluation_set
374+
# eval_set = client.evals.get_evaluation_set(
375+
# name=evaluation_run.data_source.evaluation_set
376+
# )
377+
# assert len(eval_set.evaluation_items) == 2
378+
# # Check evaluation items
379+
# for i, eval_item_name in enumerate(eval_set.evaluation_items):
380+
# eval_item = client.evals.get_evaluation_item(name=eval_item_name)
381+
# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
382+
# assert (
383+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
384+
# "prompt"
385+
# ]
386+
# == genai_types.Content(
387+
# parts=[
388+
# genai_types.Part(
389+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
390+
# )
391+
# ],
392+
# role="user",
393+
# )
394+
# )
395+
# assert (
396+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
397+
# "context"
398+
# ]
399+
# == genai_types.Content(
400+
# parts=[
401+
# genai_types.Part(
402+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
403+
# )
404+
# ],
405+
# role="user",
406+
# )
407+
# )
408+
# assert (
409+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
410+
# "conversation_history"
411+
# ]
412+
# == genai_types.Content(
413+
# parts=[
414+
# genai_types.Part(
415+
# text=(
416+
# INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
417+
# "conversation_history"
418+
# ]
419+
# )
420+
# )
421+
# ],
422+
# role="user",
423+
# )
424+
# )
425+
# assert (
426+
# eval_item.evaluation_request.candidate_responses[0].text
427+
# == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
428+
# )
429+
# assert evaluation_run.error is None
430+
431+
432+
# def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data(
433+
# client,
434+
# ):
435+
# """Tests that create_evaluation_run() creates a correctly structured
436+
# EvaluationRun with EvaluationDataset and agent_info.
437+
# Prompt template data is inferred from the dataset and a default prompt
438+
# template should be used.
439+
# """
440+
# evaluation_run = client.evals.create_evaluation_run(
441+
# name="test9",
442+
# display_name="test9",
443+
# dataset=types.EvaluationDataset(
444+
# candidate_name=CANDIDATE_NAME,
445+
# eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
446+
# ),
447+
# dest=GCS_DEST,
448+
# metrics=[GENERAL_QUALITY_METRIC],
449+
# agent_info=AGENT_INFO,
450+
# )
451+
# assert isinstance(evaluation_run, types.EvaluationRun)
452+
# assert evaluation_run.display_name == "test9"
453+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
454+
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
455+
# assert evaluation_run.inference_configs[
456+
# AGENT_INFO.name
457+
# ] == types.EvaluationRunInferenceConfig(
458+
# agent_config=types.EvaluationRunAgentConfig(
459+
# developer_instruction=genai_types.Content(
460+
# parts=[genai_types.Part(text=AGENT_INFO.instruction)]
461+
# ),
462+
# tools=[TOOL],
463+
# ),
464+
# prompt_template=types.EvaluationRunPromptTemplate(
465+
# prompt_template=DEFAULT_PROMPT_TEMPLATE
466+
# ),
467+
# )
468+
# # Check evaluation set
469+
# assert evaluation_run.data_source.evaluation_set
470+
# eval_set = client.evals.get_evaluation_set(
471+
# name=evaluation_run.data_source.evaluation_set
472+
# )
473+
# assert len(eval_set.evaluation_items) == 2
474+
# # Check evaluation items
475+
# for i, eval_item_name in enumerate(eval_set.evaluation_items):
476+
# eval_item = client.evals.get_evaluation_item(name=eval_item_name)
477+
# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
478+
# assert (
479+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
480+
# "prompt"
481+
# ]
482+
# == genai_types.Content(
483+
# parts=[
484+
# genai_types.Part(
485+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
486+
# )
487+
# ],
488+
# role="user",
489+
# )
490+
# )
491+
# assert (
492+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
493+
# "context"
494+
# ]
495+
# == genai_types.Content(
496+
# parts=[
497+
# genai_types.Part(
498+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
499+
# )
500+
# ],
501+
# role="user",
502+
# )
503+
# )
504+
# assert (
505+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
506+
# "conversation_history"
507+
# ]
508+
# == genai_types.Content(
509+
# parts=[
510+
# genai_types.Part(
511+
# text=(
512+
# INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
513+
# "conversation_history"
514+
# ]
515+
# )
516+
# )
517+
# ],
518+
# role="user",
519+
# )
520+
# )
521+
# assert (
522+
# eval_item.evaluation_request.candidate_responses[0].text
523+
# == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
524+
# )
525+
# assert evaluation_run.error is None
526+
322527
pytest_plugins = ("pytest_asyncio",)
323528

324529

@@ -371,13 +576,16 @@ async def test_create_eval_run_async_with_inference_configs(client):
371576
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
372577
client._api_client._http_options.api_version = "v1beta1"
373578
inference_config = types.EvaluationRunInferenceConfig(
374-
model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
579+
model=MODEL_NAME,
580+
prompt_template=types.EvaluationRunPromptTemplate(
581+
prompt_template="Test the {prompt}"
582+
),
375583
)
376584
evaluation_run = await client.aio.evals.create_evaluation_run(
377585
name="test_inference_config_async",
378586
display_name="test_inference_config_async",
379587
dataset=types.EvaluationRunDataSource(
380-
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
588+
evaluation_set=EVAL_SET_NAME
381589
),
382590
dest=GCS_DEST,
383591
metrics=[GENERAL_QUALITY_METRIC],
@@ -388,9 +596,7 @@ async def test_create_eval_run_async_with_inference_configs(client):
388596
assert evaluation_run.display_name == "test_inference_config_async"
389597
assert evaluation_run.state == types.EvaluationRunState.PENDING
390598
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
391-
assert evaluation_run.data_source.evaluation_set == (
392-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
393-
)
599+
assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
394600
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
395601
output_config=genai_types.OutputConfig(
396602
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)

0 commit comments

Comments (0)