From 7d6b6c091bfc8cfb854dba65e1125827415e2c15 Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:54:05 -0700 Subject: [PATCH 1/7] modified sample --- .../modified_sample_evaluations_graders.py | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py diff --git a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py new file mode 100644 index 000000000000..2ad99b0a39a2 --- /dev/null +++ b/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py @@ -0,0 +1,199 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +DESCRIPTION: + Given an AIProjectClient, this sample demonstrates how to use the synchronous + `openai.evals.*` methods to create, get and list evaluation and eval runs + using various grader types (label_model, text_similarity, string_check, score_model). + +USAGE: + python sample_evaluations_graders.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0" python-dotenv + + Set these environment variables with your own values: + 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your + Microsoft Foundry project. It has the form: https://<your-account-name>.services.ai.azure.com/api/projects/<your-project-name>. + 2) FOUNDRY_MODEL_NAME - Required. The name of the model deployment to use for evaluation. 
+""" + +import os +from pprint import pprint +from typing import Literal, TypedDict, Required +from dotenv import load_dotenv +from openai.types.graders import ( + LabelModelGraderParam, + ScoreModelGraderParam, + StringCheckGraderParam, + TextSimilarityGraderParam, +) +from openai.types.eval_create_params import DataSourceConfigCustom, TestingCriterion +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient + +# We need to define this TypedDict and others somewhere... +class AzureAIGraderCoherenceParam(TypedDict, total=False): + type: Required[Literal["azure_ai_evaluator"]] + """The object type, which is always `azure_ai_evaluator`.""" + name: Required[str] + """The name of the grader.""" + evaluator_name: str + initialization_parameters: dict + data_mapping: dict + + +load_dotenv() + +endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] +model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME", "") + +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + project_client.get_openai_client() as client, +): + + data_source_config = DataSourceConfigCustom( + { + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, + "context": {"type": "string"}, + "ground_truth": {"type": "string"}, + }, + "required": [], + }, + "include_sample_schema": True, + } + ) + + # Specified using TypedDict: + testing_criteria: list[TestingCriterion] = [ + LabelModelGraderParam( # We whave both this one in openai, as TypedDict, to be used as input. We also have LabelModelGrader as a "normal" class derived from base model with utils (to_dict, to_json) to be used as output. Don't use it here. 
+ type="label_model", + model=model_deployment_name, + input=[ + { + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'", + }, + {"role": "user", "content": "Statement: {{item.query}}"}, + ], + passing_labels=["positive", "neutral"], + labels=["positive", "neutral", "negative"], + name="label_grader", + ), + TextSimilarityGraderParam( + type="text_similarity", + input="{{item.ground_truth}}", + evaluation_metric="bleu", + reference="{{item.response}}", + pass_threshold=1, + name="text_check_grader", + ), + StringCheckGraderParam( + type="string_check", + input="{{item.ground_truth}}", + reference="{{item.ground_truth}}", + operation="eq", + name="string_check_grader", + ), + ScoreModelGraderParam( + type="score_model", + name="score", + model=model_deployment_name, + input=[ + { + "role": "system", + "content": 'Evaluate the degree of similarity between the given output and the ground truth on a scale from 1 to 5, using a chain of thought to ensure step-by-step reasoning before reaching the conclusion.\n\nConsider the following criteria:\n\n- 5: Highly similar - The output and ground truth are nearly identical, with only minor, insignificant differences.\n- 4: Somewhat similar - The output is largely similar to the ground truth but has few noticeable differences.\n- 3: Moderately similar - There are some evident differences, but the core essence is captured in the output.\n- 2: Slightly similar - The output only captures a few elements of the ground truth and contains several differences.\n- 1: Not similar - The output is significantly different from the ground truth, with few or no matching elements.\n\n# Steps\n\n1. Identify and list the key elements present in both the output and the ground truth.\n2. Compare these key elements to evaluate their similarities and differences, considering both content and structure.\n3. 
Analyze the semantic meaning conveyed by both the output and the ground truth, noting any significant deviations.\n4. Based on these comparisons, categorize the level of similarity according to the defined criteria above.\n5. Write out the reasoning for why a particular score is chosen, to ensure transparency and correctness.\n6. Assign a similarity score based on the defined criteria above.\n\n# Output Format\n\nProvide the final similarity score as an integer (1, 2, 3, 4, or 5).\n\n# Examples\n\n**Example 1:**\n\n- Output: "The cat sat on the mat."\n- Ground Truth: "The feline is sitting on the rug."\n- Reasoning: Both sentences describe a cat sitting on a surface, but they use different wording. The structure is slightly different, but the core meaning is preserved. There are noticeable differences, but the overall meaning is conveyed well.\n- Similarity Score: 3\n\n**Example 2:**\n\n- Output: "The quick brown fox jumps over the lazy dog."\n- Ground Truth: "A fast brown animal leaps over a sleeping canine."\n- Reasoning: The meaning of both sentences is very similar, with only minor differences in wording. 
The structure and intent are well preserved.\n- Similarity Score: 4\n\n# Notes\n\n- Always aim to provide a fair and balanced assessment.\n- Consider both syntactic and semantic differences in your evaluation.\n- Consistency in scoring similar pairs is crucial for accurate measurement.', + }, + {"role": "user", "content": "Output: {{item.response}}}}\nGround Truth: {{item.ground_truth}}"}, + ], + image_tag="2025-05-08", + pass_threshold=0.5, + ), + # We need to add TypedDict like this: + AzureAIGraderCoherenceParam( + type="azure_ai_evaluator", + name="coherence", + evaluator_name="builtin.coherence", + initialization_parameters={"deployment_name": f"{model_deployment_name}"}, + data_mapping={"query": "{{item.query}}", "response": "{{item.response}}"}, + ) + ] + + """ + testing_criteria = [ + { + "type": "label_model", + "model": model_deployment_name, + "input": [ + { + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'", + # TODO: Why is type="message" missing? 
+ }, + {"role": "user", "content": "Statement: {{item.query}}"}, + ], + "passing_labels": ["positive", "neutral"], + "labels": ["positive", "neutral", "negative"], + "name": "label_grader", + }, + { + "type": "text_similarity", + "input": "{{item.ground_truth}}", + "evaluation_metric": "bleu", + "reference": "{{item.response}}", + "pass_threshold": 1, + "name": "text_check_grader", + }, + { + "type": "string_check", + "input": "{{item.ground_truth}}", + "reference": "{{item.ground_truth}}", + "operation": "eq", + "name": "string_check_grader", + }, + { + "type": "score_model", + "name": "score", + "model": model_deployment_name, + "input": [ + { + "role": "system", + "content": 'Evaluate the degree of similarity between the given output and the ground truth on a scale from 1 to 5, using a chain of thought to ensure step-by-step reasoning before reaching the conclusion.\n\nConsider the following criteria:\n\n- 5: Highly similar - The output and ground truth are nearly identical, with only minor, insignificant differences.\n- 4: Somewhat similar - The output is largely similar to the ground truth but has few noticeable differences.\n- 3: Moderately similar - There are some evident differences, but the core essence is captured in the output.\n- 2: Slightly similar - The output only captures a few elements of the ground truth and contains several differences.\n- 1: Not similar - The output is significantly different from the ground truth, with few or no matching elements.\n\n# Steps\n\n1. Identify and list the key elements present in both the output and the ground truth.\n2. Compare these key elements to evaluate their similarities and differences, considering both content and structure.\n3. Analyze the semantic meaning conveyed by both the output and the ground truth, noting any significant deviations.\n4. Based on these comparisons, categorize the level of similarity according to the defined criteria above.\n5. 
Write out the reasoning for why a particular score is chosen, to ensure transparency and correctness.\n6. Assign a similarity score based on the defined criteria above.\n\n# Output Format\n\nProvide the final similarity score as an integer (1, 2, 3, 4, or 5).\n\n# Examples\n\n**Example 1:**\n\n- Output: "The cat sat on the mat."\n- Ground Truth: "The feline is sitting on the rug."\n- Reasoning: Both sentences describe a cat sitting on a surface, but they use different wording. The structure is slightly different, but the core meaning is preserved. There are noticeable differences, but the overall meaning is conveyed well.\n- Similarity Score: 3\n\n**Example 2:**\n\n- Output: "The quick brown fox jumps over the lazy dog."\n- Ground Truth: "A fast brown animal leaps over a sleeping canine."\n- Reasoning: The meaning of both sentences is very similar, with only minor differences in wording. The structure and intent are well preserved.\n- Similarity Score: 4\n\n# Notes\n\n- Always aim to provide a fair and balanced assessment.\n- Consider both syntactic and semantic differences in your evaluation.\n- Consistency in scoring similar pairs is crucial for accurate measurement.', + }, + {"role": "user", "content": "Output: {{item.response}}}}\nGround Truth: {{item.ground_truth}}"}, + ], + "image_tag": "2025-05-08", + "pass_threshold": 0.5, + }, + { # Added by Darren + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "initialization_parameters": {"deployment_name": f"{model_deployment_name}"}, + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + } + ] + """ + + print("Creating evaluation with graders") + eval_object = client.evals.create( + name="OpenAI graders test", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") + + client.evals.delete(eval_id=eval_object.id) + print("Evaluation 
deleted") From 39439b4d1f8f648ec530fb49c17801991f29161e Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Fri, 20 Mar 2026 14:38:52 -0700 Subject: [PATCH 2/7] add in a partial stub for create (#45831) Co-authored-by: Kashif Khan --- .../azure/ai/projects/_patch.py | 2 +- .../azure/ai/projects/_patch.pyi | 60 +++++++++++++++++++ .../modified_sample_evaluations_graders.py | 34 +++++------ 3 files changed, 78 insertions(+), 18 deletions(-) create mode 100644 sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.py b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.py index ec64b8f51fc8..970952187677 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.py @@ -97,7 +97,7 @@ def __init__( self.telemetry = TelemetryOperations(self) # type: ignore @distributed_trace - def get_openai_client(self, **kwargs: Any) -> "OpenAI": # type: ignore[name-defined] # pylint: disable=too-many-statements + def get_openai_client(self, **kwargs: Any) -> OpenAI: # type: ignore[name-defined] # pylint: disable=too-many-statements """Get an authenticated OpenAI client from the `openai` package. Keyword arguments are passed to the OpenAI client constructor. diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi new file mode 100644 index 000000000000..f4d024407055 --- /dev/null +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi @@ -0,0 +1,60 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +"""Type stub for _patch.py. + +Overrides get_openai_client() return type so that evals.create() accepts +Azure-specific grader types in addition to the standard OpenAI graders. 
+""" + +from typing import Any, Dict, Iterable, Union, Optional +from typing_extensions import Literal, Required, TypedDict + +import httpx +from openai import OpenAI as OpenAIClient +from openai._types import Body, Omit, Query, Headers, NotGiven +from openai.resources.evals.evals import Evals +from openai.types.eval_create_params import DataSourceConfig, TestingCriterion +from openai.types.eval_create_response import EvalCreateResponse +from openai.types.shared_params.metadata import Metadata +from openai.types.graders import ( + LabelModelGraderParam, + StringCheckGraderParam, + TextSimilarityGraderParam, + PythonGraderParam, + ScoreModelGraderParam, +) + +from ._client import AIProjectClient as AIProjectClientGenerated + +class AzureAIGraderCoherenceParam(TypedDict, total=False): + type: Required[Literal["azure_ai_evaluator"]] + name: Required[str] + evaluator_name: str + initialization_parameters: Dict[str, str] + data_mapping: Dict[str, str] + +class _AzureEvals(Evals): + def create( + self, + *, + data_source_config: DataSourceConfig, + testing_criteria: Iterable[Union[ + TestingCriterion, + AzureAIGraderCoherenceParam, + ]], + metadata: Optional[Metadata] | Omit = ..., + name: str | Omit = ..., + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = ..., + ) -> EvalCreateResponse: ... + +class OpenAI(OpenAIClient): + @property + def evals(self) -> _AzureEvals: ... + +class AIProjectClient(AIProjectClientGenerated): + def get_openai_client(self, **kwargs: Any) -> OpenAI: ... 
diff --git a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py index 2ad99b0a39a2..9dd4e46f0ac2 100644 --- a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py +++ b/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py @@ -24,31 +24,19 @@ """ import os -from pprint import pprint -from typing import Literal, TypedDict, Required -from dotenv import load_dotenv +from dotenv import load_dotenv, find_dotenv from openai.types.graders import ( LabelModelGraderParam, ScoreModelGraderParam, StringCheckGraderParam, TextSimilarityGraderParam, ) -from openai.types.eval_create_params import DataSourceConfigCustom, TestingCriterion +from openai.types.eval_create_params import DataSourceConfigCustom from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient +from azure.ai.projects._patch import AzureAIGraderCoherenceParam -# We need to define this TypedDict and others somewhere... -class AzureAIGraderCoherenceParam(TypedDict, total=False): - type: Required[Literal["azure_ai_evaluator"]] - """The object type, which is always `azure_ai_evaluator`.""" - name: Required[str] - """The name of the grader.""" - evaluator_name: str - initialization_parameters: dict - data_mapping: dict - - -load_dotenv() +load_dotenv(find_dotenv()) endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME", "") @@ -77,7 +65,7 @@ class AzureAIGraderCoherenceParam(TypedDict, total=False): ) # Specified using TypedDict: - testing_criteria: list[TestingCriterion] = [ + testing_criteria = [ LabelModelGraderParam( # We whave both this one in openai, as TypedDict, to be used as input. We also have LabelModelGrader as a "normal" class derived from base model with utils (to_dict, to_json) to be used as output. Don't use it here. 
type="label_model", model=model_deployment_name, @@ -193,6 +181,18 @@ class AzureAIGraderCoherenceParam(TypedDict, total=False): data_source_config=data_source_config, testing_criteria=testing_criteria, ) + eval_object = client.evals.create( + name="OpenAI graders test", + data_source_config=data_source_config, + testing_criteria=[ + AzureAIGraderCoherenceParam( + type="azure_ai_evaluator", + name="coherence", + evaluator_name="builtin.coherence", + initialization_parameters={"deployment_name": f"{model_deployment_name}"}, + data_mapping={"query": "{{item.query}}", "response": "{{item.response}}"}, + )], + ) print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") client.evals.delete(eval_id=eval_object.id) From ad17463d84cb121f0f7b2e76e6e2a3bcd4cb23cc Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:24:09 -0700 Subject: [PATCH 3/7] Run 'black' --- .../azure/ai/projects/_patch.pyi | 10 ++++++---- .../modified_sample_evaluations_graders.py | 17 +++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi index f4d024407055..0270a523157f 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi @@ -40,10 +40,12 @@ class _AzureEvals(Evals): self, *, data_source_config: DataSourceConfig, - testing_criteria: Iterable[Union[ - TestingCriterion, - AzureAIGraderCoherenceParam, - ]], + testing_criteria: Iterable[ + Union[ + TestingCriterion, + AzureAIGraderCoherenceParam, + ] + ], metadata: Optional[Metadata] | Omit = ..., name: str | Omit = ..., extra_headers: Headers | None = None, diff --git a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py index 
9dd4e46f0ac2..273c6d33acb9 100644 --- a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py +++ b/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py @@ -66,7 +66,7 @@ # Specified using TypedDict: testing_criteria = [ - LabelModelGraderParam( # We whave both this one in openai, as TypedDict, to be used as input. We also have LabelModelGrader as a "normal" class derived from base model with utils (to_dict, to_json) to be used as output. Don't use it here. + LabelModelGraderParam( # We whave both this one in openai, as TypedDict, to be used as input. We also have LabelModelGrader as a "normal" class derived from base model with utils (to_dict, to_json) to be used as output. Don't use it here. type="label_model", model=model_deployment_name, input=[ @@ -116,7 +116,7 @@ evaluator_name="builtin.coherence", initialization_parameters={"deployment_name": f"{model_deployment_name}"}, data_mapping={"query": "{{item.query}}", "response": "{{item.response}}"}, - ) + ), ] """ @@ -186,12 +186,13 @@ data_source_config=data_source_config, testing_criteria=[ AzureAIGraderCoherenceParam( - type="azure_ai_evaluator", - name="coherence", - evaluator_name="builtin.coherence", - initialization_parameters={"deployment_name": f"{model_deployment_name}"}, - data_mapping={"query": "{{item.query}}", "response": "{{item.response}}"}, - )], + type="azure_ai_evaluator", + name="coherence", + evaluator_name="builtin.coherence", + initialization_parameters={"deployment_name": f"{model_deployment_name}"}, + data_mapping={"query": "{{item.query}}", "response": "{{item.response}}"}, + ) + ], ) print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") From 1877a4c43a7434fc39e8afcb575282bfdce73a2b Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:29:12 -0700 Subject: [PATCH 4/7] More --- .../azure/ai/projects/_patch.pyi | 32 +--- 
.../azure/ai/projects/models/_patch.py | 4 +- .../ai/projects/models/_patch_typedict.py | 42 +++++ .../sample_evaluations_graders.alt.py} | 178 +++++++++--------- .../evaluations/sample_evaluations_graders.py | 2 +- .../tests/samples/test_samples_evaluations.py | 1 + 6 files changed, 148 insertions(+), 111 deletions(-) create mode 100644 sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typedict.py rename sdk/ai/azure-ai-projects/{typing-experiments/modified_sample_evaluations_graders.py => samples/evaluations/sample_evaluations_graders.alt.py} (53%) diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi index 0270a523157f..8a792d7b40ac 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi @@ -8,32 +8,16 @@ Overrides get_openai_client() return type so that evals.create() accepts Azure-specific grader types in addition to the standard OpenAI graders. 
""" -from typing import Any, Dict, Iterable, Union, Optional -from typing_extensions import Literal, Required, TypedDict - -import httpx -from openai import OpenAI as OpenAIClient -from openai._types import Body, Omit, Query, Headers, NotGiven +from typing import Any, Iterable, Union, Optional +from httpx import Timeout +from openai import NotGiven, Omit, OpenAI as OpenAIClient +from openai._types import Body, Query, Headers from openai.resources.evals.evals import Evals from openai.types.eval_create_params import DataSourceConfig, TestingCriterion from openai.types.eval_create_response import EvalCreateResponse from openai.types.shared_params.metadata import Metadata -from openai.types.graders import ( - LabelModelGraderParam, - StringCheckGraderParam, - TextSimilarityGraderParam, - PythonGraderParam, - ScoreModelGraderParam, -) - from ._client import AIProjectClient as AIProjectClientGenerated - -class AzureAIGraderCoherenceParam(TypedDict, total=False): - type: Required[Literal["azure_ai_evaluator"]] - name: Required[str] - evaluator_name: str - initialization_parameters: Dict[str, str] - data_mapping: Dict[str, str] +from .models import EvalGraderAzureAIEvaluator class _AzureEvals(Evals): def create( @@ -43,15 +27,15 @@ class _AzureEvals(Evals): testing_criteria: Iterable[ Union[ TestingCriterion, - AzureAIGraderCoherenceParam, + EvalGraderAzureAIEvaluator, ] ], - metadata: Optional[Metadata] | Omit = ..., + metadata: Optional[Metadata] | Omit | None = ..., name: str | Omit = ..., extra_headers: Headers | None = None, extra_query: Query | None = None, extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = ..., + timeout: float | Timeout | NotGiven | None = ..., ) -> EvalCreateResponse: ... 
class OpenAI(OpenAIClient): diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py index 5192c5315597..1bd54dcc50ae 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py @@ -9,6 +9,7 @@ """ from typing import List, Dict, Mapping, Optional, Any, Tuple +from ._patch_typedict import EvalGraderAzureAIEvaluator from azure.core.polling import LROPoller, AsyncLROPoller, PollingMethod, AsyncPollingMethod from azure.core.polling.base_polling import ( LROBasePolling, @@ -346,9 +347,10 @@ def from_continuation_token( __all__: List[str] = [ + "AsyncUpdateMemoriesLROPoller", "CustomCredential", + "EvalGraderAzureAIEvaluator", "UpdateMemoriesLROPoller", - "AsyncUpdateMemoriesLROPoller", ] # Add all objects you want publicly available to users at this package level diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typedict.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typedict.py new file mode 100644 index 000000000000..bc82619042ed --- /dev/null +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typedict.py @@ -0,0 +1,42 @@ +# pylint: disable=line-too-long,useless-suppression +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +from typing import Dict, Any +from typing_extensions import Literal, Required, TypedDict + + +class EvalGraderAzureAIEvaluator(TypedDict, total=False): + """AzureAIEvaluatorGrader. + + :ivar type: The object type, which is always ``azure_ai_evaluator``. Required. Default value is + "azure_ai_evaluator". + :vartype type: str + :ivar name: The name of the grader. Required. + :vartype name: str + :ivar evaluator_name: The name of the evaluator. Required. + :vartype evaluator_name: str + :ivar evaluator_version: The version of the evaluator. 
Latest version if not specified. + :vartype evaluator_version: str + :ivar initialization_parameters: The initialization parameters for the evaluation. Must support + structured outputs. + :vartype initialization_parameters: dict[str, any] + :ivar data_mapping: Maps evaluator input names to data-source template expressions (for example, ``query`` to ``{{item.query}}``). + :vartype data_mapping: dict[str, str] + """ + + type: Required[Literal["azure_ai_evaluator"]] + """The object type, which is always ``azure_ai_evaluator``. Required. Default value is + \"azure_ai_evaluator\".""" + name: Required[str] + """The name of the grader. Required.""" + evaluator_name: Required[str] + """The name of the evaluator. Required.""" + evaluator_version: str + """The version of the evaluator. Latest version if not specified.""" + initialization_parameters: Dict[str, Any] + """The initialization parameters for the evaluation. Must support structured outputs.""" + data_mapping: Dict[str, str] + """Maps evaluator input names to data-source template expressions (for example, ``query`` to ``{{item.query}}``).""" diff --git a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.alt.py similarity index 53% rename from sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py rename to sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.alt.py index 273c6d33acb9..aff85c5b5148 100644 --- a/sdk/ai/azure-ai-projects/typing-experiments/modified_sample_evaluations_graders.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.alt.py @@ -24,19 +24,30 @@ """ import os -from dotenv import load_dotenv, find_dotenv -from openai.types.graders import ( - LabelModelGraderParam, - ScoreModelGraderParam, - StringCheckGraderParam, - TextSimilarityGraderParam, + +import time +from pprint import pprint +from typing import List, Union +from dotenv import load_dotenv +from 
openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) +from openai.types.graders import StringCheckGraderParam +from openai.types.eval_create_params import ( + DataSourceConfigCustom, + TestingCriterion, + TestingCriterionLabelModel, + TestingCriterionTextSimilarity, + TestingCriterionScoreModel, ) from openai.types.eval_create_params import DataSourceConfigCustom from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -from azure.ai.projects._patch import AzureAIGraderCoherenceParam +from azure.ai.projects.models import EvalGraderAzureAIEvaluator -load_dotenv(find_dotenv()) +load_dotenv() endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME", "") @@ -64,9 +75,8 @@ } ) - # Specified using TypedDict: - testing_criteria = [ - LabelModelGraderParam( # We whave both this one in openai, as TypedDict, to be used as input. We also have LabelModelGrader as a "normal" class derived from base model with utils (to_dict, to_json) to be used as output. Don't use it here. 
+ testing_criteria: List[Union[TestingCriterion, EvalGraderAzureAIEvaluator]] = [ + TestingCriterionLabelModel( type="label_model", model=model_deployment_name, input=[ @@ -80,7 +90,7 @@ labels=["positive", "neutral", "negative"], name="label_grader", ), - TextSimilarityGraderParam( + TestingCriterionTextSimilarity( type="text_similarity", input="{{item.ground_truth}}", evaluation_metric="bleu", @@ -95,7 +105,7 @@ operation="eq", name="string_check_grader", ), - ScoreModelGraderParam( + TestingCriterionScoreModel( type="score_model", name="score", model=model_deployment_name, @@ -106,11 +116,9 @@ }, {"role": "user", "content": "Output: {{item.response}}}}\nGround Truth: {{item.ground_truth}}"}, ], - image_tag="2025-05-08", pass_threshold=0.5, ), - # We need to add TypedDict like this: - AzureAIGraderCoherenceParam( + EvalGraderAzureAIEvaluator( type="azure_ai_evaluator", name="coherence", evaluator_name="builtin.coherence", @@ -119,82 +127,82 @@ ), ] - """ - testing_criteria = [ - { - "type": "label_model", - "model": model_deployment_name, - "input": [ - { - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'", - # TODO: Why is type="message" missing? 
- }, - {"role": "user", "content": "Statement: {{item.query}}"}, - ], - "passing_labels": ["positive", "neutral"], - "labels": ["positive", "neutral", "negative"], - "name": "label_grader", - }, - { - "type": "text_similarity", - "input": "{{item.ground_truth}}", - "evaluation_metric": "bleu", - "reference": "{{item.response}}", - "pass_threshold": 1, - "name": "text_check_grader", - }, - { - "type": "string_check", - "input": "{{item.ground_truth}}", - "reference": "{{item.ground_truth}}", - "operation": "eq", - "name": "string_check_grader", - }, - { - "type": "score_model", - "name": "score", - "model": model_deployment_name, - "input": [ - { - "role": "system", - "content": 'Evaluate the degree of similarity between the given output and the ground truth on a scale from 1 to 5, using a chain of thought to ensure step-by-step reasoning before reaching the conclusion.\n\nConsider the following criteria:\n\n- 5: Highly similar - The output and ground truth are nearly identical, with only minor, insignificant differences.\n- 4: Somewhat similar - The output is largely similar to the ground truth but has few noticeable differences.\n- 3: Moderately similar - There are some evident differences, but the core essence is captured in the output.\n- 2: Slightly similar - The output only captures a few elements of the ground truth and contains several differences.\n- 1: Not similar - The output is significantly different from the ground truth, with few or no matching elements.\n\n# Steps\n\n1. Identify and list the key elements present in both the output and the ground truth.\n2. Compare these key elements to evaluate their similarities and differences, considering both content and structure.\n3. Analyze the semantic meaning conveyed by both the output and the ground truth, noting any significant deviations.\n4. Based on these comparisons, categorize the level of similarity according to the defined criteria above.\n5. 
Write out the reasoning for why a particular score is chosen, to ensure transparency and correctness.\n6. Assign a similarity score based on the defined criteria above.\n\n# Output Format\n\nProvide the final similarity score as an integer (1, 2, 3, 4, or 5).\n\n# Examples\n\n**Example 1:**\n\n- Output: "The cat sat on the mat."\n- Ground Truth: "The feline is sitting on the rug."\n- Reasoning: Both sentences describe a cat sitting on a surface, but they use different wording. The structure is slightly different, but the core meaning is preserved. There are noticeable differences, but the overall meaning is conveyed well.\n- Similarity Score: 3\n\n**Example 2:**\n\n- Output: "The quick brown fox jumps over the lazy dog."\n- Ground Truth: "A fast brown animal leaps over a sleeping canine."\n- Reasoning: The meaning of both sentences is very similar, with only minor differences in wording. The structure and intent are well preserved.\n- Similarity Score: 4\n\n# Notes\n\n- Always aim to provide a fair and balanced assessment.\n- Consider both syntactic and semantic differences in your evaluation.\n- Consistency in scoring similar pairs is crucial for accurate measurement.', - }, - {"role": "user", "content": "Output: {{item.response}}}}\nGround Truth: {{item.ground_truth}}"}, - ], - "image_tag": "2025-05-08", - "pass_threshold": 0.5, - }, - { # Added by Darren - "type": "azure_ai_evaluator", - "name": "coherence", - "evaluator_name": "builtin.coherence", - "initialization_parameters": {"deployment_name": f"{model_deployment_name}"}, - "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, - } - ] - """ - print("Creating evaluation with graders") eval_object = client.evals.create( name="OpenAI graders test", data_source_config=data_source_config, testing_criteria=testing_criteria, ) - eval_object = client.evals.create( - name="OpenAI graders test", - data_source_config=data_source_config, - testing_criteria=[ - AzureAIGraderCoherenceParam( - 
type="azure_ai_evaluator", - name="coherence", - evaluator_name="builtin.coherence", - initialization_parameters={"deployment_name": f"{model_deployment_name}"}, - data_mapping={"query": "{{item.query}}", "response": "{{item.response}}"}, - ) - ], - ) print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") + print("Get evaluation by Id") + eval_object_response = client.evals.retrieve(eval_object.id) + print("Evaluation Response:") + pprint(eval_object_response) + + print("Creating Eval Run with inline data") + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_graders_run", + metadata={"team": "eval-exp", "scenario": "graders-inline-v1"}, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={ + "query": "I love this product! It works great.", + "context": "Product review context", + "ground_truth": "The product is excellent and performs well.", + "response": "The product is amazing and works perfectly.", + } + ), + SourceFileContentContent( + item={ + "query": "The weather is cloudy today.", + "context": "Weather observation", + "ground_truth": "Today's weather is overcast.", + "response": "The sky is covered with clouds today.", + } + ), + SourceFileContentContent( + item={ + "query": "What is the capital of France?", + "context": "Geography question about European capitals", + "ground_truth": "Paris", + "response": "The capital of France is Paris.", + } + ), + SourceFileContentContent( + item={ + "query": "Explain quantum computing", + "context": "Complex scientific concept explanation", + "ground_truth": "Quantum computing uses quantum mechanics principles", + "response": "Quantum computing leverages quantum mechanical phenomena like superposition and entanglement to process information.", + } + ), + ], + ), + ), + ) + print(f"Eval Run created (id: {eval_run_object.id}, name: 
{eval_run_object.name})") + pprint(eval_run_object) + + print("Get Eval Run by Id") + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + print("Eval Run Response:") + pprint(eval_run_response) + + while True: + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status in ("completed", "failed"): + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + pprint(output_items) + print(f"Eval Run Report URL: {run.report_url}") + break + time.sleep(5) + print("Waiting for eval run to complete...") + client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py index f04644142485..9a60ca4c65ef 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py @@ -107,7 +107,7 @@ }, {"role": "user", "content": "Output: {{item.response}}}}\nGround Truth: {{item.ground_truth}}"}, ], - "image_tag": "2025-05-08", + "image_tag": "2025-05-08", # What is this doing here? 
OpenAI only defines this for the PythonGrader (type="python") "pass_threshold": 0.5, }, ] diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index bb68d1015730..c797f8e1e87c 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -173,6 +173,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase): "sample_eval_catalog_prompt_based_evaluators.py", # For some reason fails with 500 (Internal server error) "sample_eval_upload_custom_evaluator.py", # TODO: Need to add recordings "sample_eval_upload_friendly_evaluator.py", # TODO: Need to add recordings + "sample_evaluations_graders.alt.py", # Need to remove "image_tag": "2025-05-08" from the recordings, once .alt.py relplaces the original ], ), ) From 9ef4b8e564279a6ea497abb0dda3a7d1dbbb6015 Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:36:43 -0700 Subject: [PATCH 5/7] rename --- sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py | 2 +- .../models/{_patch_typedict.py => _patch_typeddicts.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename sdk/ai/azure-ai-projects/azure/ai/projects/models/{_patch_typedict.py => _patch_typeddicts.py} (100%) diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py index 1bd54dcc50ae..395b8c7e0523 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py @@ -9,7 +9,7 @@ """ from typing import List, Dict, Mapping, Optional, Any, Tuple -from ._patch_typedict import EvalGraderAzureAIEvaluator +from ._patch_typeddicts import EvalGraderAzureAIEvaluator from azure.core.polling import LROPoller, AsyncLROPoller, PollingMethod, AsyncPollingMethod from 
azure.core.polling.base_polling import ( LROBasePolling, diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typedict.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py similarity index 100% rename from sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typedict.py rename to sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py From a85d7708a5d439bbb27ed5104493e28dcc8f0903 Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Thu, 26 Mar 2026 19:52:16 -0700 Subject: [PATCH 6/7] Another sample --- .../azure/ai/projects/_patch.pyi | 23 ++- .../azure/ai/projects/models/_patch.py | 4 +- .../ai/projects/models/_patch_typeddicts.py | 62 ++++++- .../sample_agent_evaluation.alt.py | 155 ++++++++++++++++++ .../evaluations/sample_evaluations_graders.py | 2 +- .../tests/samples/test_samples_evaluations.py | 3 +- 6 files changed, 243 insertions(+), 6 deletions(-) create mode 100644 sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.alt.py diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi index 8a792d7b40ac..c91f72c5ee2d 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi @@ -13,11 +13,28 @@ from httpx import Timeout from openai import NotGiven, Omit, OpenAI as OpenAIClient from openai._types import Body, Query, Headers from openai.resources.evals.evals import Evals +from openai.resources.evals.runs.runs import Runs +from openai.types.evals.run_create_params import DataSource +from openai.types.evals.run_create_response import RunCreateResponse from openai.types.eval_create_params import DataSourceConfig, TestingCriterion from openai.types.eval_create_response import EvalCreateResponse from openai.types.shared_params.metadata import Metadata from ._client import AIProjectClient as AIProjectClientGenerated 
-from .models import EvalGraderAzureAIEvaluator +from .models import EvalGraderAzureAIEvaluator, TargetCompletionEvalRunDataSource + +class _AzureEvalRuns(Runs): + def create( + self, + eval_id: str, + *, + data_source: Union[DataSource, TargetCompletionEvalRunDataSource], # <=== Azure extention here + metadata: Optional[Metadata] | Omit = ..., + name: str | Omit = ..., + extra_headers: Headers | None = ..., + extra_query: Query | None = ..., + extra_body: Body | None = ..., + timeout: float | Timeout | None | NotGiven = ..., + ) -> RunCreateResponse: ... class _AzureEvals(Evals): def create( @@ -27,7 +44,7 @@ class _AzureEvals(Evals): testing_criteria: Iterable[ Union[ TestingCriterion, - EvalGraderAzureAIEvaluator, + EvalGraderAzureAIEvaluator, # <=== Azure extension here ] ], metadata: Optional[Metadata] | Omit | None = ..., @@ -37,6 +54,8 @@ class _AzureEvals(Evals): extra_body: Body | None = None, timeout: float | Timeout | NotGiven | None = ..., ) -> EvalCreateResponse: ... + @property + def runs(self) -> _AzureEvalRuns: ...
class OpenAI(OpenAIClient): @property diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py index e05a9c2c0a88..281687232bdf 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py @@ -9,7 +9,7 @@ """ from typing import Final, FrozenSet, List, Dict, Mapping, Optional, Any, Tuple -from ._patch_typeddicts import EvalGraderAzureAIEvaluator +from ._patch_typeddicts import AzureAIAgentTarget, EvalGraderAzureAIEvaluator, TargetCompletionEvalRunDataSource from azure.core.polling import LROPoller, AsyncLROPoller, PollingMethod, AsyncPollingMethod from azure.core.polling.base_polling import ( LROBasePolling, @@ -348,8 +348,10 @@ def from_continuation_token( __all__: List[str] = [ "AsyncUpdateMemoriesLROPoller", + "AzureAIAgentTarget", "CustomCredential", "EvalGraderAzureAIEvaluator", + "TargetCompletionEvalRunDataSource", "UpdateMemoriesLROPoller", ] # Add all objects you want publicly available to users at this package level diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py index bc82619042ed..62b82d199d20 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py @@ -4,8 +4,68 @@ # Licensed under the MIT License. # ------------------------------------ -from typing import Dict, Any +from typing import Dict, Any, List, Union from typing_extensions import Literal, Required, TypedDict +from openai.types.evals.create_eval_completions_run_data_source_param import ( + InputMessagesItemReference, + SourceFileContent, + SourceFileID, +) +from ._models import ToolDescription + + +class AzureAIAgentTarget(TypedDict, total=False): + """Represents a target specifying an Azure AI agent. 
+ + :ivar type: The type of target, always ``azure_ai_agent``. Required. Default value is + "azure_ai_agent". + :vartype type: str + :ivar name: The unique identifier of the Azure AI agent. Required. + :vartype name: str + :ivar version: The version of the Azure AI agent. + :vartype version: str + :ivar tool_descriptions: The parameters used to control the sampling behavior of the agent + during text generation. + :vartype tool_descriptions: list[~azure.ai.projects.models.ToolDescription] + """ + + type: Required[Literal["azure_ai_agent"]] + """The type of target, always ``azure_ai_agent``. Required. Default value is \"azure_ai_agent\".""" + name: Required[str] + """The unique identifier of the Azure AI agent. Required.""" + version: str + """The version of the Azure AI agent.""" + tool_descriptions: List[ToolDescription] + """The parameters used to control the sampling behavior of the agent during text generation.""" + + +class TargetCompletionEvalRunDataSource(TypedDict, total=False): + """Represents a data source for target-based completion evaluation configuration. + + :ivar type: The type of data source, always ``azure_ai_target_completions``. Required. Default + value is "azure_ai_target_completions". + :vartype type: str + :ivar input_messages: Input messages configuration. + :vartype input_messages: + ~azure.ai.projects.models.CreateEvalCompletionsRunDataSourceInputMessagesItemReference + :ivar source: The source configuration for inline or file data. Required. Is either a + SourceFileContent type or a SourceFileID type. + :vartype source: ~azure.ai.projects.models.SourceFileContent or + ~azure.ai.projects.models.SourceFileID + :ivar target: The target configuration for the evaluation. Required. + :vartype target: ~azure.ai.projects.models.Target + """ + + type: Required[Literal["azure_ai_target_completions"]] + """The type of data source, always ``azure_ai_target_completions``. Required. 
Default value is + \"azure_ai_target_completions\".""" + source: Required[Union[SourceFileContent, SourceFileID]] + """The source configuration for inline or file data. Required. Is either a + SourceFileContent type or a SourceFileID type.""" + target: Required[AzureAIAgentTarget] + """The target configuration for the evaluation. Required.""" + input_messages: Required[InputMessagesItemReference] + """Input messages configuration.""" class EvalGraderAzureAIEvaluator(TypedDict, total=False): diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.alt.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.alt.py new file mode 100644 index 000000000000..cd445f20a0f3 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.alt.py @@ -0,0 +1,155 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to create and run an evaluation for an Azure AI agent + using the synchronous AIProjectClient. + + The OpenAI compatible Evals calls in this sample are made using + the OpenAI client from the `openai` package. See https://platform.openai.com/docs/api-reference + for more information. + +USAGE: + python sample_agent_evaluation.py + + Before running the sample: + + pip install "azure-ai-projects>=2.0.0" python-dotenv + + Set these environment variables with your own values: + 1) FOUNDRY_PROJECT_ENDPOINT - The Azure AI Project endpoint, as found in the Overview + page of your Microsoft Foundry portal. + 2) FOUNDRY_AGENT_NAME - The name of the AI agent to use for evaluation. + 3) FOUNDRY_MODEL_NAME - The deployment name of the AI model, as found under the "Name" column in + the "Models + endpoints" tab in your Microsoft Foundry project. 
+""" + +import os +import time +from typing import Union +from pprint import pprint +from dotenv import load_dotenv +from openai.types.evals.create_eval_completions_run_data_source_param import SourceFileContent, SourceFileContentContent +from openai.types.eval_create_params import DataSourceConfigCustom +from openai.types.evals.run_create_response import RunCreateResponse +from openai.types.evals.run_retrieve_response import RunRetrieveResponse +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient +from azure.ai.projects.models import ( + AzureAIAgentTarget, + EvalGraderAzureAIEvaluator, + PromptAgentDefinition, + TargetCompletionEvalRunDataSource, +) + +load_dotenv() +endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] +model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME", "") # Sample : gpt-4o-mini +agent_name = os.environ["FOUNDRY_AGENT_NAME"] + +# [START agent_evaluation_basic] +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + project_client.get_openai_client() as openai_client, +): + agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition( + model=model_deployment_name, + instructions="You are a helpful assistant that answers general questions", + ), + ) + print(f"Agent created (id: {agent.id}, name: {agent.name}, version: {agent.version})") + + data_source_config = DataSourceConfigCustom( + type="custom", + item_schema={"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}, + include_sample_schema=True, + ) + # Notes: for data_mapping: + # sample.output_text is the string output of the agent + # sample.output_items is the structured JSON output of the agent, including tool calls information + testing_criteria = [ + EvalGraderAzureAIEvaluator( + type="azure_ai_evaluator", + name="violence_detection", + evaluator_name="builtin.violence", + 
data_mapping={"query": "{{item.query}}", "response": "{{sample.output_text}}"}, + ), + EvalGraderAzureAIEvaluator( + type="azure_ai_evaluator", + name="fluency", + evaluator_name="builtin.fluency", + initialization_parameters={"deployment_name": f"{model_deployment_name}"}, + data_mapping={"query": "{{item.query}}", "response": "{{sample.output_text}}"}, + ), + EvalGraderAzureAIEvaluator( + type="azure_ai_evaluator", + name="task_adherence", + evaluator_name="builtin.task_adherence", + initialization_parameters={"deployment_name": f"{model_deployment_name}"}, + data_mapping={"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + ), + ] + eval_object = openai_client.evals.create( + name="Agent Evaluation", + data_source_config=data_source_config, + testing_criteria=testing_criteria, # type: ignore + ) + print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") + + data_source = TargetCompletionEvalRunDataSource( + type="azure_ai_target_completions", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent(item={"query": "What is the capital of France?"}), + SourceFileContentContent(item={"query": "How do I reverse a string in Python?"}), + ], + ), + input_messages={ + "type": "template", # TODO: This is not an option based on our TypeSpec.. + "template": [ + {"type": "message", "role": "user", "content": {"type": "input_text", "text": "{{item.query}}"}} + ], + }, + target=AzureAIAgentTarget( + type="azure_ai_agent", + name=agent.name, + version=agent.version, # Version is optional. 
Defaults to latest version if not specified + ), + ) + + agent_eval_run: Union[RunCreateResponse, RunRetrieveResponse] = openai_client.evals.runs.create( + eval_id=eval_object.id, name=f"Evaluation Run for Agent {agent.name}", data_source=data_source # type: ignore + ) + print(f"Evaluation run created (id: {agent_eval_run.id})") + # [END agent_evaluation_basic] + + while agent_eval_run.status not in ["completed", "failed"]: + agent_eval_run = openai_client.evals.runs.retrieve(run_id=agent_eval_run.id, eval_id=eval_object.id) + print(f"Waiting for eval run to complete... current status: {agent_eval_run.status}") + time.sleep(5) + + if agent_eval_run.status == "completed": + print("\n✓ Evaluation run completed successfully!") + print(f"Result Counts: {agent_eval_run.result_counts}") + + output_items = list( + openai_client.evals.runs.output_items.list(run_id=agent_eval_run.id, eval_id=eval_object.id) + ) + print(f"\nOUTPUT ITEMS (Total: {len(output_items)})") + print(f"{'-'*60}") + pprint(output_items) + print(f"{'-'*60}") + else: + print("\n✗ Evaluation run failed.") + + openai_client.evals.delete(eval_id=eval_object.id) + print("Evaluation deleted") + + project_client.agents.delete(agent_name=agent.name) + print("Agent deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py index 9a60ca4c65ef..0c284b0253a4 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py @@ -107,7 +107,7 @@ }, {"role": "user", "content": "Output: {{item.response}}}}\nGround Truth: {{item.ground_truth}}"}, ], - "image_tag": "2025-05-08", # What is this doing here? OpenAI only defines this for the PythonGrader (type="python") + "image_tag": "2025-05-08", # What is this doing here? 
OpenAI only defines this for the PythonGrader (type="python") "pass_threshold": 0.5, }, ] diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index c797f8e1e87c..2d0f589df5d6 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -173,7 +173,8 @@ class TestSamplesEvaluations(AzureRecordedTestCase): "sample_eval_catalog_prompt_based_evaluators.py", # For some reason fails with 500 (Internal server error) "sample_eval_upload_custom_evaluator.py", # TODO: Need to add recordings "sample_eval_upload_friendly_evaluator.py", # TODO: Need to add recordings - "sample_evaluations_graders.alt.py", # Need to remove "image_tag": "2025-05-08" from the recordings, once .alt.py relplaces the original + "sample_evaluations_graders.alt.py", # Need to remove "image_tag": "2025-05-08" from the recordings, once .alt.py replaces the original + "sample_agent_evaluation.alt.py", # This is the same sample as sample_agent_evaluation.py but using Azure-specific grader/data source types, so we want to test both ], ), ) From afb9ee7162fb7424d892fd9a1c6c2aa699554c22 Mon Sep 17 00:00:00 2001 From: Darren Cohen <39422044+dargilco@users.noreply.github.com> Date: Fri, 27 Mar 2026 07:40:49 -0700 Subject: [PATCH 7/7] More --- .../azure/ai/projects/_patch.pyi | 13 +- .../azure/ai/projects/models/_patch.py | 12 +- .../ai/projects/models/_patch_typeddicts.py | 84 +++++++++++ .../sample_agent_response_evaluation.alt.py | 132 ++++++++++++++++++ 4 files changed, 237 insertions(+), 4 deletions(-) create mode 100644 sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.alt.py diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi index c91f72c5ee2d..8cf3797adc83 100644 ---
a/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/_patch.pyi @@ -20,14 +20,21 @@ from openai.types.eval_create_params import DataSourceConfig, TestingCriterion from openai.types.eval_create_response import EvalCreateResponse from openai.types.shared_params.metadata import Metadata from ._client import AIProjectClient as AIProjectClientGenerated -from .models import EvalGraderAzureAIEvaluator, TargetCompletionEvalRunDataSource +from .models import ( + EvalGraderAzureAIEvaluator, + TargetCompletionEvalRunDataSource, + AzureAIDataSourceConfig, + AzureAIResponsesEvalRunDataSource, +) class _AzureEvalRuns(Runs): def create( self, eval_id: str, *, - data_source: Union[DataSource, TargetCompletionEvalRunDataSource], # <=== Azure extention here + data_source: Union[ + DataSource, TargetCompletionEvalRunDataSource, AzureAIResponsesEvalRunDataSource + ], # <=== Azure extension here metadata: Optional[Metadata] | Omit = ..., name: str | Omit = ..., extra_headers: Headers | None = ..., @@ -40,7 +47,7 @@ class _AzureEvals(Evals): def create( self, *, - data_source_config: DataSourceConfig, + data_source_config: Union[DataSourceConfig, AzureAIDataSourceConfig], # <=== Azure extension here testing_criteria: Iterable[ Union[ TestingCriterion, diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py index 281687232bdf..c1a5488001ef 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch.py @@ -9,7 +9,14 @@ """ from typing import Final, FrozenSet, List, Dict, Mapping, Optional, Any, Tuple -from ._patch_typeddicts import AzureAIAgentTarget, EvalGraderAzureAIEvaluator, TargetCompletionEvalRunDataSource +from ._patch_typeddicts import ( + AzureAIAgentTarget, + AzureAIDataSourceConfig, + AzureAIResponsesEvalRunDataSource, + EvalGraderAzureAIEvaluator, +
ResponseRetrievalItemGenerationParams, + TargetCompletionEvalRunDataSource, +) from azure.core.polling import LROPoller, AsyncLROPoller, PollingMethod, AsyncPollingMethod from azure.core.polling.base_polling import ( LROBasePolling, @@ -349,8 +356,11 @@ def from_continuation_token( __all__: List[str] = [ "AsyncUpdateMemoriesLROPoller", "AzureAIAgentTarget", + "AzureAIDataSourceConfig", + "AzureAIResponsesEvalRunDataSource", "CustomCredential", "EvalGraderAzureAIEvaluator", + "ResponseRetrievalItemGenerationParams", "TargetCompletionEvalRunDataSource", "UpdateMemoriesLROPoller", ] # Add all objects you want publicly available to users at this package level diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py index 62b82d199d20..a71505f3be03 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_patch_typeddicts.py @@ -14,6 +14,90 @@ from ._models import ToolDescription +class ResponseRetrievalItemGenerationParams(TypedDict, total=False): + """Represents the parameters for response retrieval item generation. + + :ivar type: The type of item generation parameters, always ``response_retrieval``. Required. + The ResponseRetrieval item generation parameters. + :vartype type: str or ~azure.ai.projects.models.RESPONSE_RETRIEVAL + :ivar max_num_turns: The maximum number of turns of chat history to evaluate. Required. + :vartype max_num_turns: int + :ivar data_mapping: Mapping from source fields to response_id field, required for retrieving + chat history. Required. + :vartype data_mapping: dict[str, str] + :ivar source: The source from which JSONL content is read. Required. Is either a + EvalJsonlFileContentSource type or a EvalJsonlFileIdSource type. 
+ :vartype source: ~azure.ai.projects.models.EvalJsonlFileContentSource or + ~azure.ai.projects.models.EvalJsonlFileIdSource + """ + + type: Required[Literal["response_retrieval"]] + """The type of item generation parameters, always ``response_retrieval``. Required. The + ResponseRetrieval item generation parameters.""" + max_num_turns: Required[int] + """The maximum number of turns of chat history to evaluate. Required.""" + data_mapping: Required[Dict[str, str]] + """Mapping from source fields to response_id field, required for retrieving chat history. + Required.""" + source: Required[Union[SourceFileContent, SourceFileID]] + """The source from which JSONL content is read. Required. Is either a SourceFileContent + type or a SourceFileID type.""" + + +class AzureAIResponsesEvalRunDataSource(TypedDict, total=False): + """Represents a data source for evaluation runs that are specific to Continuous Evaluation + scenarios. + + :ivar type: The type of data source, always ``azure_ai_responses``. Required. Default value is + "azure_ai_responses". + :vartype type: str + :ivar item_generation_params: The parameters for item generation. Required. + :vartype item_generation_params: + ~azure.ai.projects.models.ResponseRetrievalItemGenerationParams + :ivar max_runs_hourly: Maximum number of evaluation runs allowed per hour. Required. + :vartype max_runs_hourly: int + :ivar event_configuration_id: The event configuration name associated with this evaluation run. + Required. + :vartype event_configuration_id: str + """ + + type: Required[Literal["azure_ai_responses"]] + """The type of data source, always ``azure_ai_responses``. Required. Default value is + \"azure_ai_responses\".""" + item_generation_params: Required[ResponseRetrievalItemGenerationParams] + """The parameters for item generation. Required.""" + max_runs_hourly: Required[int] + """Maximum number of evaluation runs allowed per hour. 
Required.""" + event_configuration_id: Required[str] + """The event configuration name associated with this evaluation run. Required.""" + + +class AzureAIDataSourceConfig(TypedDict, total=False): + """AzureAIDataSourceConfig. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + AzureAIBenchmarkDataSourceConfig + + :ivar schema: The overall object JSON schema for the run data source items. Required. + :vartype schema: dict[str, any] + :ivar type: The object type, which is always ``azure_ai_source``. Required. Default value is + "azure_ai_source". + :vartype type: str + :ivar scenario: Data schema scenario. Required. Is one of the following types: + Literal["red_team"], Literal["responses"], Literal["traces_preview"], + Literal["synthetic_data_gen_preview"], Literal["benchmark_preview"] + :vartype scenario: str or str or str or str or str + """ + + type: Required[Literal["azure_ai_source"]] + """The object type, which is always ``azure_ai_source``. Required. Default value is + \"azure_ai_source\".""" + scenario: Required[str] # TODO: Update typespec to define the below strings as enum + """Data schema scenario. Required. Is one of the following types: Literal[\"red_team\"], + Literal[\"responses\"], Literal[\"traces_preview\"], Literal[\"synthetic_data_gen_preview\"], + Literal[\"benchmark_preview\"]""" + + class AzureAIAgentTarget(TypedDict, total=False): """Represents a target specifying an Azure AI agent. diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.alt.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.alt.py new file mode 100644 index 000000000000..3ff0ba700f80 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.alt.py @@ -0,0 +1,132 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
# ------------------------------------
"""
DESCRIPTION:
    This sample demonstrates how to create and run an evaluation for an Azure AI agent response
    using the synchronous AIProjectClient.

    The OpenAI compatible Evals calls in this sample are made using
    the OpenAI client from the `openai` package. See https://platform.openai.com/docs/api-reference
    for more information.

    The evaluation and the agent created by this sample are deleted before the sample exits,
    including when one of the intermediate calls raises.

USAGE:
    python sample_agent_response_evaluation.alt.py

    Before running the sample:

    pip install "azure-ai-projects>=2.0.0" python-dotenv

    Set these environment variables with your own values:
    1) FOUNDRY_PROJECT_ENDPOINT - The Azure AI Project endpoint, as found in the Overview
       page of your Microsoft Foundry portal.
    2) FOUNDRY_AGENT_NAME - The name of the AI agent to use for evaluation.
    3) FOUNDRY_MODEL_NAME - The deployment name of the AI model, as found under the "Name" column in
       the "Models + endpoints" tab in your Microsoft Foundry project.
"""

import os
import time
from typing import Union
from pprint import pprint
from dotenv import load_dotenv
from openai.types.evals.create_eval_completions_run_data_source_param import SourceFileContent, SourceFileContentContent
from openai.types.evals.run_create_response import RunCreateResponse
from openai.types.evals.run_retrieve_response import RunRetrieveResponse
from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import (
    AzureAIDataSourceConfig,
    AzureAIResponsesEvalRunDataSource,
    EvalGraderAzureAIEvaluator,
    PromptAgentDefinition,
    ResponseRetrievalItemGenerationParams,
)

load_dotenv()

# Read every required setting up front so a missing variable fails before any
# credential is acquired or any service-side resource is created.
endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
agent_name = os.environ["FOUNDRY_AGENT_NAME"]
model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"]

# Statuses after which an eval run no longer changes. "canceled" is included so
# that a run cancelled out-of-band does not leave the polling loop spinning forever.
TERMINAL_RUN_STATUSES = ("completed", "failed", "canceled")
POLL_INTERVAL_SECONDS = 5

with (
    DefaultAzureCredential() as credential,
    AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
    project_client.get_openai_client() as openai_client,
):

    agent = project_client.agents.create_version(
        agent_name=agent_name,
        definition=PromptAgentDefinition(
            model=model_deployment_name,
            instructions="You are a helpful assistant that answers general questions",
        ),
    )
    print(f"Agent created (id: {agent.id}, name: {agent.name}, version: {agent.version})")

    # From here on the agent exists on the service: always delete it on the way out.
    try:
        conversation = openai_client.conversations.create(
            items=[{"type": "message", "role": "user", "content": "What is the size of France in square miles?"}],
        )
        print(f"Created conversation with initial user message (id: {conversation.id})")

        response = openai_client.responses.create(
            conversation=conversation.id,
            extra_body={"agent_reference": {"name": agent.name, "type": "agent_reference"}},
        )
        print(f"Response output: {response.output_text} (id: {response.id})")

        data_source_config = AzureAIDataSourceConfig(type="azure_ai_source", scenario="responses")
        testing_criteria = [
            EvalGraderAzureAIEvaluator(
                type="azure_ai_evaluator", name="violence_detection", evaluator_name="builtin.violence"
            )
        ]
        eval_object = openai_client.evals.create(
            name="Agent Response Evaluation",
            data_source_config=data_source_config,  # type: ignore
            testing_criteria=testing_criteria,  # type: ignore
        )
        print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})")

        # Same guarantee for the evaluation object.
        try:
            # The run evaluates the response produced above: each item carries a response id,
            # which `data_mapping` exposes to the service as `response_id`.
            data_source = AzureAIResponsesEvalRunDataSource(
                type="azure_ai_responses",
                item_generation_params=ResponseRetrievalItemGenerationParams(
                    type="response_retrieval",
                    data_mapping={"response_id": "{{item.resp_id}}"},
                    source=SourceFileContent(
                        type="file_content",
                        content=[SourceFileContentContent(item={"resp_id": response.id})],
                    ),
                ),
            )

            response_eval_run: Union[RunCreateResponse, RunRetrieveResponse] = openai_client.evals.runs.create(
                eval_id=eval_object.id, name=f"Evaluation Run for Agent {agent.name}", data_source=data_source  # type: ignore
            )
            print(f"Evaluation run created (id: {response_eval_run.id})")

            # Report and sleep first, then refresh: once a terminal status is observed the
            # loop exits immediately instead of waiting one more interval.
            while response_eval_run.status not in TERMINAL_RUN_STATUSES:
                print(f"Waiting for eval run to complete... current status: {response_eval_run.status}")
                time.sleep(POLL_INTERVAL_SECONDS)
                response_eval_run = openai_client.evals.runs.retrieve(
                    run_id=response_eval_run.id, eval_id=eval_object.id
                )

            if response_eval_run.status == "completed":
                print("\n✓ Evaluation run completed successfully!")
                print(f"Result Counts: {response_eval_run.result_counts}")

                output_items = list(
                    openai_client.evals.runs.output_items.list(run_id=response_eval_run.id, eval_id=eval_object.id)
                )
                print(f"\nOUTPUT ITEMS (Total: {len(output_items)})")
                print(f"Eval Run Report URL: {response_eval_run.report_url}")

                print(f"{'-'*60}")
                pprint(output_items)
                print(f"{'-'*60}")
            else:
                # Prints "failed" or "canceled", whichever terminal status was reached.
                print(f"\n✗ Evaluation run {response_eval_run.status}.")
        finally:
            openai_client.evals.delete(eval_id=eval_object.id)
            print("Evaluation deleted")
    finally:
        project_client.agents.delete(agent_name=agent.name)
        print("Agent deleted")