Skip to content

Commit 0255db4

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add PromptTemplateData to support context and history columns when creating Evaluation run from dataframe
PiperOrigin-RevId: 871483777
1 parent 204e5b4 commit 0255db4

File tree

6 files changed

+409
-63
lines changed

6 files changed

+409
-63
lines changed

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 246 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
from vertexai import types
1919
from google.genai import types as genai_types
2020
import pytest
21+
import pandas as pd
2122

2223
GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
2324
GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
@@ -63,29 +64,48 @@
6364
)
6465
),
6566
)
67+
TOOL = genai_types.Tool(
68+
function_declarations=[
69+
genai_types.FunctionDeclaration(
70+
name="get_weather",
71+
description="Get weather in a location",
72+
parameters={
73+
"type": "object",
74+
"properties": {"location": {"type": "string"}},
75+
},
76+
)
77+
]
78+
)
79+
AGENT_INFO = types.evals.AgentInfo(
80+
agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
81+
name="agent-1",
82+
instruction="agent-1 instruction",
83+
tool_declarations=[TOOL],
84+
)
85+
DEFAULT_PROMPT_TEMPLATE = "{prompt}"
86+
INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
87+
{
88+
"prompt": ["prompt1", "prompt2"],
89+
"reference": ["reference1", "reference2"],
90+
"response": ["response1", "response2"],
91+
"context": ["context1", "context2"],
92+
"conversation_history": ["history1", "history2"],
93+
}
94+
)
95+
CANDIDATE_NAME = "candidate_1"
96+
MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
97+
EVAL_SET_NAME = (
98+
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
99+
)
66100

67101

68102
def test_create_eval_run_data_source_evaluation_set(client):
69103
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
70104
client._api_client._http_options.api_version = "v1beta1"
71-
tool = genai_types.Tool(
72-
function_declarations=[
73-
genai_types.FunctionDeclaration(
74-
name="get_weather",
75-
description="Get weather in a location",
76-
parameters={
77-
"type": "object",
78-
"properties": {"location": {"type": "string"}},
79-
},
80-
)
81-
]
82-
)
83105
evaluation_run = client.evals.create_evaluation_run(
84106
name="test4",
85107
display_name="test4",
86-
dataset=types.EvaluationRunDataSource(
87-
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
88-
),
108+
dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
89109
dest=GCS_DEST,
90110
metrics=[
91111
GENERAL_QUALITY_METRIC,
@@ -94,21 +114,14 @@ def test_create_eval_run_data_source_evaluation_set(client):
94114
EXACT_MATCH_COMPUTATION_BASED_METRIC,
95115
BLEU_COMPUTATION_BASED_METRIC,
96116
],
97-
agent_info=types.evals.AgentInfo(
98-
agent_resource_name="project/123/locations/us-central1/reasoningEngines/456",
99-
name="agent-1",
100-
instruction="agent-1 instruction",
101-
tool_declarations=[tool],
102-
),
117+
agent_info=AGENT_INFO,
103118
labels={"label1": "value1"},
104119
)
105120
assert isinstance(evaluation_run, types.EvaluationRun)
106121
assert evaluation_run.display_name == "test4"
107122
assert evaluation_run.state == types.EvaluationRunState.PENDING
108123
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
109-
assert evaluation_run.data_source.evaluation_set == (
110-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
111-
)
124+
assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
112125
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
113126
output_config=genai_types.OutputConfig(
114127
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -122,13 +135,13 @@ def test_create_eval_run_data_source_evaluation_set(client):
122135
],
123136
)
124137
assert evaluation_run.inference_configs[
125-
"agent-1"
138+
AGENT_INFO.name
126139
] == types.EvaluationRunInferenceConfig(
127140
agent_config=types.EvaluationRunAgentConfig(
128141
developer_instruction=genai_types.Content(
129142
parts=[genai_types.Part(text="agent-1 instruction")]
130143
),
131-
tools=[tool],
144+
tools=[TOOL],
132145
)
133146
)
134147
assert evaluation_run.labels == {
@@ -190,14 +203,15 @@ def test_create_eval_run_with_inference_configs(client):
190203
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
191204
client._api_client._http_options.api_version = "v1beta1"
192205
inference_config = types.EvaluationRunInferenceConfig(
193-
model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
206+
model=MODEL_NAME,
207+
prompt_template=types.EvaluationRunPromptTemplate(
208+
prompt_template="test prompt template"
209+
),
194210
)
195211
evaluation_run = client.evals.create_evaluation_run(
196212
name="test_inference_config",
197213
display_name="test_inference_config",
198-
dataset=types.EvaluationRunDataSource(
199-
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
200-
),
214+
dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
201215
dest=GCS_DEST,
202216
metrics=[GENERAL_QUALITY_METRIC],
203217
inference_configs={"model_1": inference_config},
@@ -207,9 +221,7 @@ def test_create_eval_run_with_inference_configs(client):
207221
assert evaluation_run.display_name == "test_inference_config"
208222
assert evaluation_run.state == types.EvaluationRunState.PENDING
209223
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
210-
assert evaluation_run.data_source.evaluation_set == (
211-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
212-
)
224+
assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
213225
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
214226
output_config=genai_types.OutputConfig(
215227
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
@@ -223,9 +235,11 @@ def test_create_eval_run_with_inference_configs(client):
223235
assert evaluation_run.error is None
224236

225237

226-
# Test fails in replay mode because of UUID generation mismatch.
238+
# Dataframe tests fail in replay mode because of UUID generation mismatch.
227239
# def test_create_eval_run_data_source_evaluation_dataset(client):
228-
# """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
240+
# """Tests that create_evaluation_run() creates a correctly structured
241+
# EvaluationRun with EvaluationDataset.
242+
# """
229243
# input_df = pd.DataFrame(
230244
# {
231245
# "prompt": ["prompt1", "prompt2"],
@@ -275,7 +289,7 @@ def test_create_eval_run_with_inference_configs(client):
275289
# name="test6",
276290
# display_name="test6",
277291
# dataset=types.EvaluationDataset(
278-
# candidate_name="candidate_1",
292+
# candidate_name=CANDIDATE_NAME,
279293
# eval_dataset_df=input_df,
280294
# ),
281295
# dest=GCS_DEST,
@@ -319,6 +333,196 @@ def test_create_eval_run_with_inference_configs(client):
319333
# assert evaluation_run.error is None
320334

321335

336+
# def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data(
337+
# client,
338+
# ):
339+
# """Tests that create_evaluation_run() creates a correctly structured
340+
# EvaluationRun with EvaluationDataset and inference_configs.
341+
# Prompt template data is inferred from the dataset and a default prompt
342+
# template should be used.
343+
# """
344+
# evaluation_run = client.evals.create_evaluation_run(
345+
# name="test9",
346+
# display_name="test9",
347+
# dataset=types.EvaluationDataset(
348+
# candidate_name=CANDIDATE_NAME,
349+
# eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
350+
# ),
351+
# dest=GCS_DEST,
352+
# metrics=[GENERAL_QUALITY_METRIC],
353+
# inference_configs={
354+
# CANDIDATE_NAME: types.EvaluationRunInferenceConfig(
355+
# model=MODEL_NAME,
356+
# )
357+
# },
358+
# )
359+
# assert isinstance(evaluation_run, types.EvaluationRun)
360+
# assert evaluation_run.display_name == "test9"
361+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
362+
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
363+
# assert evaluation_run.inference_configs[
364+
# CANDIDATE_NAME
365+
# ] == types.EvaluationRunInferenceConfig(
366+
# model=MODEL_NAME,
367+
# prompt_template=types.EvaluationRunPromptTemplate(
368+
# prompt_template=DEFAULT_PROMPT_TEMPLATE
369+
# ),
370+
# )
371+
# # Check evaluation set
372+
# assert evaluation_run.data_source.evaluation_set
373+
# eval_set = client.evals.get_evaluation_set(
374+
# name=evaluation_run.data_source.evaluation_set
375+
# )
376+
# assert len(eval_set.evaluation_items) == 2
377+
# # Check evaluation items
378+
# for i, eval_item_name in enumerate(eval_set.evaluation_items):
379+
# eval_item = client.evals.get_evaluation_item(name=eval_item_name)
380+
# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
381+
# assert (
382+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
383+
# "prompt"
384+
# ]
385+
# == genai_types.Content(
386+
# parts=[
387+
# genai_types.Part(
388+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
389+
# )
390+
# ],
391+
# role="user",
392+
# )
393+
# )
394+
# assert (
395+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
396+
# "context"
397+
# ]
398+
# == genai_types.Content(
399+
# parts=[
400+
# genai_types.Part(
401+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
402+
# )
403+
# ],
404+
# role="user",
405+
# )
406+
# )
407+
# assert (
408+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
409+
# "conversation_history"
410+
# ]
411+
# == genai_types.Content(
412+
# parts=[
413+
# genai_types.Part(
414+
# text=(
415+
# INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
416+
# "conversation_history"
417+
# ]
418+
# )
419+
# )
420+
# ],
421+
# role="user",
422+
# )
423+
# )
424+
# assert (
425+
# eval_item.evaluation_request.candidate_responses[0].text
426+
# == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
427+
# )
428+
# assert evaluation_run.error is None
429+
430+
431+
# def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data(
432+
# client,
433+
# ):
434+
# """Tests that create_evaluation_run() creates a correctly structured
435+
# EvaluationRun with EvaluationDataset and agent_info.
436+
# Prompt template data is inferred from the dataset and a default prompt
437+
# template should be used.
438+
# """
439+
# evaluation_run = client.evals.create_evaluation_run(
440+
# name="test9",
441+
# display_name="test9",
442+
# dataset=types.EvaluationDataset(
443+
# candidate_name=CANDIDATE_NAME,
444+
# eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
445+
# ),
446+
# dest=GCS_DEST,
447+
# metrics=[GENERAL_QUALITY_METRIC],
448+
# agent_info=AGENT_INFO,
449+
# )
450+
# assert isinstance(evaluation_run, types.EvaluationRun)
451+
# assert evaluation_run.display_name == "test9"
452+
# assert evaluation_run.state == types.EvaluationRunState.PENDING
453+
# assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
454+
# assert evaluation_run.inference_configs[
455+
# AGENT_INFO.name
456+
# ] == types.EvaluationRunInferenceConfig(
457+
# agent_config=types.EvaluationRunAgentConfig(
458+
# developer_instruction=genai_types.Content(
459+
# parts=[genai_types.Part(text=AGENT_INFO.instruction)]
460+
# ),
461+
# tools=[TOOL],
462+
# ),
463+
# prompt_template=types.EvaluationRunPromptTemplate(
464+
# prompt_template=DEFAULT_PROMPT_TEMPLATE
465+
# ),
466+
# )
467+
# # Check evaluation set
468+
# assert evaluation_run.data_source.evaluation_set
469+
# eval_set = client.evals.get_evaluation_set(
470+
# name=evaluation_run.data_source.evaluation_set
471+
# )
472+
# assert len(eval_set.evaluation_items) == 2
473+
# # Check evaluation items
474+
# for i, eval_item_name in enumerate(eval_set.evaluation_items):
475+
# eval_item = client.evals.get_evaluation_item(name=eval_item_name)
476+
# assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
477+
# assert (
478+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
479+
# "prompt"
480+
# ]
481+
# == genai_types.Content(
482+
# parts=[
483+
# genai_types.Part(
484+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
485+
# )
486+
# ],
487+
# role="user",
488+
# )
489+
# )
490+
# assert (
491+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
492+
# "context"
493+
# ]
494+
# == genai_types.Content(
495+
# parts=[
496+
# genai_types.Part(
497+
# text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
498+
# )
499+
# ],
500+
# role="user",
501+
# )
502+
# )
503+
# assert (
504+
# eval_item.evaluation_request.prompt.prompt_template_data.values[
505+
# "conversation_history"
506+
# ]
507+
# == genai_types.Content(
508+
# parts=[
509+
# genai_types.Part(
510+
# text=(
511+
# INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
512+
# "conversation_history"
513+
# ]
514+
# )
515+
# )
516+
# ],
517+
# role="user",
518+
# )
519+
# )
520+
# assert (
521+
# eval_item.evaluation_request.candidate_responses[0].text
522+
# == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
523+
# )
524+
# assert evaluation_run.error is None
525+
322526
pytest_plugins = ("pytest_asyncio",)
323527

324528

@@ -371,14 +575,15 @@ async def test_create_eval_run_async_with_inference_configs(client):
371575
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
372576
client._api_client._http_options.api_version = "v1beta1"
373577
inference_config = types.EvaluationRunInferenceConfig(
374-
model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
578+
model=MODEL_NAME,
579+
prompt_template=types.EvaluationRunPromptTemplate(
580+
prompt_template="Test the {prompt}"
581+
),
375582
)
376583
evaluation_run = await client.aio.evals.create_evaluation_run(
377584
name="test_inference_config_async",
378585
display_name="test_inference_config_async",
379-
dataset=types.EvaluationRunDataSource(
380-
evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
381-
),
586+
dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
382587
dest=GCS_DEST,
383588
metrics=[GENERAL_QUALITY_METRIC],
384589
inference_configs={"model_1": inference_config},
@@ -388,9 +593,7 @@ async def test_create_eval_run_async_with_inference_configs(client):
388593
assert evaluation_run.display_name == "test_inference_config_async"
389594
assert evaluation_run.state == types.EvaluationRunState.PENDING
390595
assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
391-
assert evaluation_run.data_source.evaluation_set == (
392-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
393-
)
596+
assert evaluation_run.data_source.evaluation_set == EVAL_SET_NAME
394597
assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
395598
output_config=genai_types.OutputConfig(
396599
gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)

0 commit comments

Comments
 (0)