1818from vertexai import types
1919from google .genai import types as genai_types
2020import pytest
21+ import pandas as pd
2122
# GCS prefix where evaluation-run output artifacts are written.
GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
2324GENERAL_QUALITY_METRIC = types .EvaluationRunMetric (
6364 )
6465 ),
6566)
# Shared function-calling tool declaration reused by the eval-run tests.
TOOL = genai_types.Tool(
    function_declarations=[
        genai_types.FunctionDeclaration(
            name="get_weather",
            description="Get weather in a location",
            parameters={
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        )
    ]
)
# Agent metadata fixture; tool_declarations reuses the module-level TOOL.
AGENT_INFO = types.evals.AgentInfo(
    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
    name="agent-1",
    instruction="agent-1 instruction",
    tool_declarations=[TOOL],
)
# Default prompt template applied when none is supplied explicitly.
DEFAULT_PROMPT_TEMPLATE = "{prompt}"
# Two-row evaluation dataset that exercises the optional `context` and
# `conversation_history` columns in addition to prompt/reference/response.
INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
    {
        "prompt": ["prompt1", "prompt2"],
        "reference": ["reference1", "reference2"],
        "response": ["response1", "response2"],
        "context": ["context1", "context2"],
        "conversation_history": ["history1", "history2"],
    }
)
# Candidate label used when creating EvaluationDataset fixtures.
CANDIDATE_NAME = "candidate_1"
# Fully-qualified publisher model resource used in inference configs.
MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
# Pre-existing evaluation set resource name referenced by data-source tests.
EVAL_SET_NAME = (
    "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
)
66100
67101
68102def test_create_eval_run_data_source_evaluation_set (client ):
69103 """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
70104 client ._api_client ._http_options .api_version = "v1beta1"
71- tool = genai_types .Tool (
72- function_declarations = [
73- genai_types .FunctionDeclaration (
74- name = "get_weather" ,
75- description = "Get weather in a location" ,
76- parameters = {
77- "type" : "object" ,
78- "properties" : {"location" : {"type" : "string" }},
79- },
80- )
81- ]
82- )
83105 evaluation_run = client .evals .create_evaluation_run (
84106 name = "test4" ,
85107 display_name = "test4" ,
86- dataset = types .EvaluationRunDataSource (
87- evaluation_set = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
88- ),
108+ dataset = types .EvaluationRunDataSource (evaluation_set = EVAL_SET_NAME ),
89109 dest = GCS_DEST ,
90110 metrics = [
91111 GENERAL_QUALITY_METRIC ,
@@ -94,21 +114,14 @@ def test_create_eval_run_data_source_evaluation_set(client):
94114 EXACT_MATCH_COMPUTATION_BASED_METRIC ,
95115 BLEU_COMPUTATION_BASED_METRIC ,
96116 ],
97- agent_info = types .evals .AgentInfo (
98- agent_resource_name = "project/123/locations/us-central1/reasoningEngines/456" ,
99- name = "agent-1" ,
100- instruction = "agent-1 instruction" ,
101- tool_declarations = [tool ],
102- ),
117+ agent_info = AGENT_INFO ,
103118 labels = {"label1" : "value1" },
104119 )
105120 assert isinstance (evaluation_run , types .EvaluationRun )
106121 assert evaluation_run .display_name == "test4"
107122 assert evaluation_run .state == types .EvaluationRunState .PENDING
108123 assert isinstance (evaluation_run .data_source , types .EvaluationRunDataSource )
109- assert evaluation_run .data_source .evaluation_set == (
110- "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
111- )
124+ assert evaluation_run .data_source .evaluation_set == EVAL_SET_NAME
112125 assert evaluation_run .evaluation_config == types .EvaluationRunConfig (
113126 output_config = genai_types .OutputConfig (
114127 gcs_destination = genai_types .GcsDestination (output_uri_prefix = GCS_DEST )
@@ -122,13 +135,13 @@ def test_create_eval_run_data_source_evaluation_set(client):
122135 ],
123136 )
124137 assert evaluation_run .inference_configs [
125- "agent-1"
138+ AGENT_INFO . name
126139 ] == types .EvaluationRunInferenceConfig (
127140 agent_config = types .EvaluationRunAgentConfig (
128141 developer_instruction = genai_types .Content (
129142 parts = [genai_types .Part (text = "agent-1 instruction" )]
130143 ),
131- tools = [tool ],
144+ tools = [TOOL ],
132145 )
133146 )
134147 assert evaluation_run .labels == {
@@ -190,14 +203,15 @@ def test_create_eval_run_with_inference_configs(client):
190203 """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
191204 client ._api_client ._http_options .api_version = "v1beta1"
192205 inference_config = types .EvaluationRunInferenceConfig (
193- model = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
206+ model = MODEL_NAME ,
207+ prompt_template = types .EvaluationRunPromptTemplate (
208+ prompt_template = "test prompt template"
209+ ),
194210 )
195211 evaluation_run = client .evals .create_evaluation_run (
196212 name = "test_inference_config" ,
197213 display_name = "test_inference_config" ,
198- dataset = types .EvaluationRunDataSource (
199- evaluation_set = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
200- ),
214+ dataset = types .EvaluationRunDataSource (evaluation_set = EVAL_SET_NAME ),
201215 dest = GCS_DEST ,
202216 metrics = [GENERAL_QUALITY_METRIC ],
203217 inference_configs = {"model_1" : inference_config },
@@ -207,9 +221,7 @@ def test_create_eval_run_with_inference_configs(client):
207221 assert evaluation_run .display_name == "test_inference_config"
208222 assert evaluation_run .state == types .EvaluationRunState .PENDING
209223 assert isinstance (evaluation_run .data_source , types .EvaluationRunDataSource )
210- assert evaluation_run .data_source .evaluation_set == (
211- "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
212- )
224+ assert evaluation_run .data_source .evaluation_set == EVAL_SET_NAME
213225 assert evaluation_run .evaluation_config == types .EvaluationRunConfig (
214226 output_config = genai_types .OutputConfig (
215227 gcs_destination = genai_types .GcsDestination (output_uri_prefix = GCS_DEST )
@@ -223,9 +235,11 @@ def test_create_eval_run_with_inference_configs(client):
223235 assert evaluation_run .error is None
224236
225237
226- # Test fails in replay mode because of UUID generation mismatch.
238+ # Dataframe tests fail in replay mode because of UUID generation mismatch.
227239# def test_create_eval_run_data_source_evaluation_dataset(client):
228- # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
240+ # """Tests that create_evaluation_run() creates a correctly structured
241+ # EvaluationRun with EvaluationDataset.
242+ # """
229243# input_df = pd.DataFrame(
230244# {
231245# "prompt": ["prompt1", "prompt2"],
@@ -275,7 +289,7 @@ def test_create_eval_run_with_inference_configs(client):
275289# name="test6",
276290# display_name="test6",
277291# dataset=types.EvaluationDataset(
278- # candidate_name="candidate_1" ,
292+ # candidate_name=CANDIDATE_NAME ,
279293# eval_dataset_df=input_df,
280294# ),
281295# dest=GCS_DEST,
@@ -319,6 +333,196 @@ def test_create_eval_run_with_inference_configs(client):
319333# assert evaluation_run.error is None
320334
321335
336+ # def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data(
337+ # client,
338+ # ):
339+ # """Tests that create_evaluation_run() creates a correctly structured
340+ # EvaluationRun with EvaluationDataset and inference_configs.
341+ # Prompt template data is inferred from the dataset and a default prompt
342+ # template should be used.
343+ # """
344+ # evaluation_run = client.evals.create_evaluation_run(
345+ # name="test9",
346+ # display_name="test9",
347+ # dataset=types.EvaluationDataset(
348+ # candidate_name=CANDIDATE_NAME,
349+ # eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
350+ # ),
351+ # dest=GCS_DEST,
352+ # metrics=[GENERAL_QUALITY_METRIC],
353+ # inference_configs={
354+ # CANDIDATE_NAME: types.EvaluationRunInferenceConfig(
355+ # model=MODEL_NAME,
356+ # )
357+ # },
358+ # )
359+ # assert isinstance(evaluation_run, types.EvaluationRun)
360+ # assert evaluation_run.display_name == "test9"
361+ # assert evaluation_run.state == types.EvaluationRunState.PENDING
362+ # assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
363+ # assert evaluation_run.inference_configs[
364+ # CANDIDATE_NAME
365+ # ] == types.EvaluationRunInferenceConfig(
366+ # model=MODEL_NAME,
367+ # prompt_template=types.EvaluationRunPromptTemplate(
368+ # prompt_template=DEFAULT_PROMPT_TEMPLATE
369+ # ),
370+ # )
371+ # # Check evaluation set
372+ # assert evaluation_run.data_source.evaluation_set
373+ # eval_set = client.evals.get_evaluation_set(
374+ # name=evaluation_run.data_source.evaluation_set
375+ # )
376+ # assert len(eval_set.evaluation_items) == 2
377+ # # Check evaluation items
378+ # for i, eval_item_name in enumerate(eval_set.evaluation_items):
379+ # eval_item = client.evals.get_evaluation_item(name=eval_item_name)
380+ # assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
381+ # assert (
382+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
383+ # "prompt"
384+ # ]
385+ # == genai_types.Content(
386+ # parts=[
387+ # genai_types.Part(
388+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
389+ # )
390+ # ],
391+ # role="user",
392+ # )
393+ # )
394+ # assert (
395+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
396+ # "context"
397+ # ]
398+ # == genai_types.Content(
399+ # parts=[
400+ # genai_types.Part(
401+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
402+ # )
403+ # ],
404+ # role="user",
405+ # )
406+ # )
407+ # assert (
408+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
409+ # "conversation_history"
410+ # ]
411+ # == genai_types.Content(
412+ # parts=[
413+ # genai_types.Part(
414+ # text=(
415+ # INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
416+ # "conversation_history"
417+ # ]
418+ # )
419+ # )
420+ # ],
421+ # role="user",
422+ # )
423+ # )
424+ # assert (
425+ # eval_item.evaluation_request.candidate_responses[0].text
426+ # == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
427+ # )
428+ # assert evaluation_run.error is None
429+
430+
431+ # def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data(
432+ # client,
433+ # ):
434+ # """Tests that create_evaluation_run() creates a correctly structured
435+ # EvaluationRun with EvaluationDataset and agent_info.
436+ # Prompt template data is inferred from the dataset and a default prompt
437+ # template should be used.
438+ # """
439+ # evaluation_run = client.evals.create_evaluation_run(
440+ # name="test9",
441+ # display_name="test9",
442+ # dataset=types.EvaluationDataset(
443+ # candidate_name=CANDIDATE_NAME,
444+ # eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
445+ # ),
446+ # dest=GCS_DEST,
447+ # metrics=[GENERAL_QUALITY_METRIC],
448+ # agent_info=AGENT_INFO,
449+ # )
450+ # assert isinstance(evaluation_run, types.EvaluationRun)
451+ # assert evaluation_run.display_name == "test9"
452+ # assert evaluation_run.state == types.EvaluationRunState.PENDING
453+ # assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
454+ # assert evaluation_run.inference_configs[
455+ # AGENT_INFO.name
456+ # ] == types.EvaluationRunInferenceConfig(
457+ # agent_config=types.EvaluationRunAgentConfig(
458+ # developer_instruction=genai_types.Content(
459+ # parts=[genai_types.Part(text=AGENT_INFO.instruction)]
460+ # ),
461+ # tools=[TOOL],
462+ # ),
463+ # prompt_template=types.EvaluationRunPromptTemplate(
464+ # prompt_template=DEFAULT_PROMPT_TEMPLATE
465+ # ),
466+ # )
467+ # # Check evaluation set
468+ # assert evaluation_run.data_source.evaluation_set
469+ # eval_set = client.evals.get_evaluation_set(
470+ # name=evaluation_run.data_source.evaluation_set
471+ # )
472+ # assert len(eval_set.evaluation_items) == 2
473+ # # Check evaluation items
474+ # for i, eval_item_name in enumerate(eval_set.evaluation_items):
475+ # eval_item = client.evals.get_evaluation_item(name=eval_item_name)
476+ # assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
477+ # assert (
478+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
479+ # "prompt"
480+ # ]
481+ # == genai_types.Content(
482+ # parts=[
483+ # genai_types.Part(
484+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
485+ # )
486+ # ],
487+ # role="user",
488+ # )
489+ # )
490+ # assert (
491+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
492+ # "context"
493+ # ]
494+ # == genai_types.Content(
495+ # parts=[
496+ # genai_types.Part(
497+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
498+ # )
499+ # ],
500+ # role="user",
501+ # )
502+ # )
503+ # assert (
504+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
505+ # "conversation_history"
506+ # ]
507+ # == genai_types.Content(
508+ # parts=[
509+ # genai_types.Part(
510+ # text=(
511+ # INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
512+ # "conversation_history"
513+ # ]
514+ # )
515+ # )
516+ # ],
517+ # role="user",
518+ # )
519+ # )
520+ # assert (
521+ # eval_item.evaluation_request.candidate_responses[0].text
522+ # == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
523+ # )
524+ # assert evaluation_run.error is None
525+
322526pytest_plugins = ("pytest_asyncio" ,)
323527
324528
@@ -371,14 +575,15 @@ async def test_create_eval_run_async_with_inference_configs(client):
371575 """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
372576 client ._api_client ._http_options .api_version = "v1beta1"
373577 inference_config = types .EvaluationRunInferenceConfig (
374- model = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
578+ model = MODEL_NAME ,
579+ prompt_template = types .EvaluationRunPromptTemplate (
580+ prompt_template = "Test the {prompt}"
581+ ),
375582 )
376583 evaluation_run = await client .aio .evals .create_evaluation_run (
377584 name = "test_inference_config_async" ,
378585 display_name = "test_inference_config_async" ,
379- dataset = types .EvaluationRunDataSource (
380- evaluation_set = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
381- ),
586+ dataset = types .EvaluationRunDataSource (evaluation_set = EVAL_SET_NAME ),
382587 dest = GCS_DEST ,
383588 metrics = [GENERAL_QUALITY_METRIC ],
384589 inference_configs = {"model_1" : inference_config },
@@ -388,9 +593,7 @@ async def test_create_eval_run_async_with_inference_configs(client):
388593 assert evaluation_run .display_name == "test_inference_config_async"
389594 assert evaluation_run .state == types .EvaluationRunState .PENDING
390595 assert isinstance (evaluation_run .data_source , types .EvaluationRunDataSource )
391- assert evaluation_run .data_source .evaluation_set == (
392- "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
393- )
596+ assert evaluation_run .data_source .evaluation_set == EVAL_SET_NAME
394597 assert evaluation_run .evaluation_config == types .EvaluationRunConfig (
395598 output_config = genai_types .OutputConfig (
396599 gcs_destination = genai_types .GcsDestination (output_uri_prefix = GCS_DEST )
0 commit comments