1818from vertexai import types
1919from google .genai import types as genai_types
2020import pytest
21+ import pandas as pd
2122
2223GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
2324GENERAL_QUALITY_METRIC = types .EvaluationRunMetric (
6364 )
6465 ),
6566)
66-
67+ TOOL = genai_types .Tool (
68+ function_declarations = [
69+ genai_types .FunctionDeclaration (
70+ name = "get_weather" ,
71+ description = "Get weather in a location" ,
72+ parameters = {
73+ "type" : "object" ,
74+ "properties" : {"location" : {"type" : "string" }},
75+ },
76+ )
77+ ]
78+ )
79+ AGENT_INFO = types .evals .AgentInfo (
80+ agent_resource_name = "projects/123/locations/us-central1/reasoningEngines/456" ,
81+ name = "agent-1" ,
82+ instruction = "agent-1 instruction" ,
83+ tool_declarations = [TOOL ],
84+ )
85+ DEFAULT_PROMPT_TEMPLATE = "{prompt}"
86+ INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd .DataFrame (
87+ {
88+ "prompt" : ["prompt1" , "prompt2" ],
89+ "reference" : ["reference1" , "reference2" ],
90+ "response" : ["response1" , "response2" ],
91+ "context" : ["context1" , "context2" ],
92+ "conversation_history" : ["history1" , "history2" ],
93+ }
94+ )
95+ CANDIDATE_NAME = "candidate_1"
96+ MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
97+ EVAL_SET_NAME = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
6798
6899def test_create_eval_run_data_source_evaluation_set (client ):
69100 """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
70101 client ._api_client ._http_options .api_version = "v1beta1"
71- tool = genai_types .Tool (
72- function_declarations = [
73- genai_types .FunctionDeclaration (
74- name = "get_weather" ,
75- description = "Get weather in a location" ,
76- parameters = {
77- "type" : "object" ,
78- "properties" : {"location" : {"type" : "string" }},
79- },
80- )
81- ]
82- )
83102 evaluation_run = client .evals .create_evaluation_run (
84103 name = "test4" ,
85104 display_name = "test4" ,
86105 dataset = types .EvaluationRunDataSource (
87- evaluation_set = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
106+ evaluation_set = EVAL_SET_NAME
88107 ),
89108 dest = GCS_DEST ,
90109 metrics = [
@@ -94,21 +113,14 @@ def test_create_eval_run_data_source_evaluation_set(client):
94113 EXACT_MATCH_COMPUTATION_BASED_METRIC ,
95114 BLEU_COMPUTATION_BASED_METRIC ,
96115 ],
97- agent_info = types .evals .AgentInfo (
98- agent_resource_name = "project/123/locations/us-central1/reasoningEngines/456" ,
99- name = "agent-1" ,
100- instruction = "agent-1 instruction" ,
101- tool_declarations = [tool ],
102- ),
116+ agent_info = AGENT_INFO ,
103117 labels = {"label1" : "value1" },
104118 )
105119 assert isinstance (evaluation_run , types .EvaluationRun )
106120 assert evaluation_run .display_name == "test4"
107121 assert evaluation_run .state == types .EvaluationRunState .PENDING
108122 assert isinstance (evaluation_run .data_source , types .EvaluationRunDataSource )
109- assert evaluation_run .data_source .evaluation_set == (
110- "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
111- )
123+ assert evaluation_run .data_source .evaluation_set == EVAL_SET_NAME
112124 assert evaluation_run .evaluation_config == types .EvaluationRunConfig (
113125 output_config = genai_types .OutputConfig (
114126 gcs_destination = genai_types .GcsDestination (output_uri_prefix = GCS_DEST )
@@ -122,13 +134,13 @@ def test_create_eval_run_data_source_evaluation_set(client):
122134 ],
123135 )
124136 assert evaluation_run .inference_configs [
125- "agent-1"
137+ AGENT_INFO . name
126138 ] == types .EvaluationRunInferenceConfig (
127139 agent_config = types .EvaluationRunAgentConfig (
128140 developer_instruction = genai_types .Content (
129141 parts = [genai_types .Part (text = "agent-1 instruction" )]
130142 ),
131- tools = [tool ],
143+ tools = [TOOL ],
132144 )
133145 )
134146 assert evaluation_run .labels == {
@@ -190,13 +202,16 @@ def test_create_eval_run_with_inference_configs(client):
190202 """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
191203 client ._api_client ._http_options .api_version = "v1beta1"
192204 inference_config = types .EvaluationRunInferenceConfig (
193- model = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
205+ model = MODEL_NAME ,
206+ prompt_template = types .EvaluationRunPromptTemplate (
207+ prompt_template = "test prompt template"
208+ ),
194209 )
195210 evaluation_run = client .evals .create_evaluation_run (
196211 name = "test_inference_config" ,
197212 display_name = "test_inference_config" ,
198213 dataset = types .EvaluationRunDataSource (
199- evaluation_set = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
214+ evaluation_set = EVAL_SET_NAME
200215 ),
201216 dest = GCS_DEST ,
202217 metrics = [GENERAL_QUALITY_METRIC ],
@@ -207,9 +222,7 @@ def test_create_eval_run_with_inference_configs(client):
207222 assert evaluation_run .display_name == "test_inference_config"
208223 assert evaluation_run .state == types .EvaluationRunState .PENDING
209224 assert isinstance (evaluation_run .data_source , types .EvaluationRunDataSource )
210- assert evaluation_run .data_source .evaluation_set == (
211- "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
212- )
225+ assert evaluation_run .data_source .evaluation_set == EVAL_SET_NAME
213226 assert evaluation_run .evaluation_config == types .EvaluationRunConfig (
214227 output_config = genai_types .OutputConfig (
215228 gcs_destination = genai_types .GcsDestination (output_uri_prefix = GCS_DEST )
@@ -223,9 +236,11 @@ def test_create_eval_run_with_inference_configs(client):
223236 assert evaluation_run .error is None
224237
225238
226- # Test fails in replay mode because of UUID generation mismatch.
239+ # Dataframe tests fail in replay mode because of UUID generation mismatch.
227240# def test_create_eval_run_data_source_evaluation_dataset(client):
228- # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
241+ # """Tests that create_evaluation_run() creates a correctly structured
242+ # EvaluationRun with EvaluationDataset.
243+ # """
229244# input_df = pd.DataFrame(
230245# {
231246# "prompt": ["prompt1", "prompt2"],
@@ -275,7 +290,7 @@ def test_create_eval_run_with_inference_configs(client):
275290# name="test6",
276291# display_name="test6",
277292# dataset=types.EvaluationDataset(
278- # candidate_name="candidate_1" ,
293+ # candidate_name=CANDIDATE_NAME ,
279294# eval_dataset_df=input_df,
280295# ),
281296# dest=GCS_DEST,
@@ -319,6 +334,196 @@ def test_create_eval_run_with_inference_configs(client):
319334# assert evaluation_run.error is None
320335
321336
337+ # def test_create_eval_run_data_source_evaluation_dataset_with_inference_configs_and_prompt_template_data(
338+ # client,
339+ # ):
340+ # """Tests that create_evaluation_run() creates a correctly structured
341+ # EvaluationRun with EvaluationDataset and inference_configs.
342+ # Prompt template data is inferred from the dataset and a default prompt
343+ # template should be used.
344+ # """
345+ # evaluation_run = client.evals.create_evaluation_run(
346+ # name="test9",
347+ # display_name="test9",
348+ # dataset=types.EvaluationDataset(
349+ # candidate_name=CANDIDATE_NAME,
350+ # eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
351+ # ),
352+ # dest=GCS_DEST,
353+ # metrics=[GENERAL_QUALITY_METRIC],
354+ # inference_configs={
355+ # CANDIDATE_NAME: types.EvaluationRunInferenceConfig(
356+ # model=MODEL_NAME,
357+ # )
358+ # },
359+ # )
360+ # assert isinstance(evaluation_run, types.EvaluationRun)
361+ # assert evaluation_run.display_name == "test9"
362+ # assert evaluation_run.state == types.EvaluationRunState.PENDING
363+ # assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
364+ # assert evaluation_run.inference_configs[
365+ # CANDIDATE_NAME
366+ # ] == types.EvaluationRunInferenceConfig(
367+ # model=MODEL_NAME,
368+ # prompt_template=types.EvaluationRunPromptTemplate(
369+ # prompt_template=DEFAULT_PROMPT_TEMPLATE
370+ # ),
371+ # )
372+ # # Check evaluation set
373+ # assert evaluation_run.data_source.evaluation_set
374+ # eval_set = client.evals.get_evaluation_set(
375+ # name=evaluation_run.data_source.evaluation_set
376+ # )
377+ # assert len(eval_set.evaluation_items) == 2
378+ # # Check evaluation items
379+ # for i, eval_item_name in enumerate(eval_set.evaluation_items):
380+ # eval_item = client.evals.get_evaluation_item(name=eval_item_name)
381+ # assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
382+ # assert (
383+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
384+ # "prompt"
385+ # ]
386+ # == genai_types.Content(
387+ # parts=[
388+ # genai_types.Part(
389+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
390+ # )
391+ # ],
392+ # role="user",
393+ # )
394+ # )
395+ # assert (
396+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
397+ # "context"
398+ # ]
399+ # == genai_types.Content(
400+ # parts=[
401+ # genai_types.Part(
402+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
403+ # )
404+ # ],
405+ # role="user",
406+ # )
407+ # )
408+ # assert (
409+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
410+ # "conversation_history"
411+ # ]
412+ # == genai_types.Content(
413+ # parts=[
414+ # genai_types.Part(
415+ # text=(
416+ # INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
417+ # "conversation_history"
418+ # ]
419+ # )
420+ # )
421+ # ],
422+ # role="user",
423+ # )
424+ # )
425+ # assert (
426+ # eval_item.evaluation_request.candidate_responses[0].text
427+ # == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
428+ # )
429+ # assert evaluation_run.error is None
430+
431+
432+ # def test_create_eval_run_data_source_evaluation_dataset_with_agent_info_and_prompt_template_data(
433+ # client,
434+ # ):
435+ # """Tests that create_evaluation_run() creates a correctly structured
436+ # EvaluationRun with EvaluationDataset and agent_info.
437+ # Prompt template data is inferred from the dataset and a default prompt
438+ # template should be used.
439+ # """
440+ # evaluation_run = client.evals.create_evaluation_run(
441+ # name="test10",
442+ # display_name="test10",
443+ # dataset=types.EvaluationDataset(
444+ # candidate_name=CANDIDATE_NAME,
445+ # eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY,
446+ # ),
447+ # dest=GCS_DEST,
448+ # metrics=[GENERAL_QUALITY_METRIC],
449+ # agent_info=AGENT_INFO,
450+ # )
451+ # assert isinstance(evaluation_run, types.EvaluationRun)
452+ # assert evaluation_run.display_name == "test10"
453+ # assert evaluation_run.state == types.EvaluationRunState.PENDING
454+ # assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
455+ # assert evaluation_run.inference_configs[
456+ # AGENT_INFO.name
457+ # ] == types.EvaluationRunInferenceConfig(
458+ # agent_config=types.EvaluationRunAgentConfig(
459+ # developer_instruction=genai_types.Content(
460+ # parts=[genai_types.Part(text=AGENT_INFO.instruction)]
461+ # ),
462+ # tools=[TOOL],
463+ # ),
464+ # prompt_template=types.EvaluationRunPromptTemplate(
465+ # prompt_template=DEFAULT_PROMPT_TEMPLATE
466+ # ),
467+ # )
468+ # # Check evaluation set
469+ # assert evaluation_run.data_source.evaluation_set
470+ # eval_set = client.evals.get_evaluation_set(
471+ # name=evaluation_run.data_source.evaluation_set
472+ # )
473+ # assert len(eval_set.evaluation_items) == 2
474+ # # Check evaluation items
475+ # for i, eval_item_name in enumerate(eval_set.evaluation_items):
476+ # eval_item = client.evals.get_evaluation_item(name=eval_item_name)
477+ # assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
478+ # assert (
479+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
480+ # "prompt"
481+ # ]
482+ # == genai_types.Content(
483+ # parts=[
484+ # genai_types.Part(
485+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["prompt"]
486+ # )
487+ # ],
488+ # role="user",
489+ # )
490+ # )
491+ # assert (
492+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
493+ # "context"
494+ # ]
495+ # == genai_types.Content(
496+ # parts=[
497+ # genai_types.Part(
498+ # text=INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["context"]
499+ # )
500+ # ],
501+ # role="user",
502+ # )
503+ # )
504+ # assert (
505+ # eval_item.evaluation_request.prompt.prompt_template_data.values[
506+ # "conversation_history"
507+ # ]
508+ # == genai_types.Content(
509+ # parts=[
510+ # genai_types.Part(
511+ # text=(
512+ # INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i][
513+ # "conversation_history"
514+ # ]
515+ # )
516+ # )
517+ # ],
518+ # role="user",
519+ # )
520+ # )
521+ # assert (
522+ # eval_item.evaluation_request.candidate_responses[0].text
523+ # == INPUT_DF_WITH_CONTEXT_AND_HISTORY.iloc[i]["response"]
524+ # )
525+ # assert evaluation_run.error is None
526+
322527pytest_plugins = ("pytest_asyncio" ,)
323528
324529
@@ -371,13 +576,16 @@ async def test_create_eval_run_async_with_inference_configs(client):
371576 """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
372577 client ._api_client ._http_options .api_version = "v1beta1"
373578 inference_config = types .EvaluationRunInferenceConfig (
374- model = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
579+ model = MODEL_NAME ,
580+ prompt_template = types .EvaluationRunPromptTemplate (
581+ prompt_template = "Test the {prompt}"
582+ ),
375583 )
376584 evaluation_run = await client .aio .evals .create_evaluation_run (
377585 name = "test_inference_config_async" ,
378586 display_name = "test_inference_config_async" ,
379587 dataset = types .EvaluationRunDataSource (
380- evaluation_set = "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
588+ evaluation_set = EVAL_SET_NAME
381589 ),
382590 dest = GCS_DEST ,
383591 metrics = [GENERAL_QUALITY_METRIC ],
@@ -388,9 +596,7 @@ async def test_create_eval_run_async_with_inference_configs(client):
388596 assert evaluation_run .display_name == "test_inference_config_async"
389597 assert evaluation_run .state == types .EvaluationRunState .PENDING
390598 assert isinstance (evaluation_run .data_source , types .EvaluationRunDataSource )
391- assert evaluation_run .data_source .evaluation_set == (
392- "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
393- )
599+ assert evaluation_run .data_source .evaluation_set == EVAL_SET_NAME
394600 assert evaluation_run .evaluation_config == types .EvaluationRunConfig (
395601 output_config = genai_types .OutputConfig (
396602 gcs_destination = genai_types .GcsDestination (output_uri_prefix = GCS_DEST )
0 commit comments