diff --git a/python/samples/getting_started/evaluation/self_reflection/.env.example b/python/samples/getting_started/evaluation/self_reflection/.env.example
index 9f6dc82564..413a62c0ff 100644
--- a/python/samples/getting_started/evaluation/self_reflection/.env.example
+++ b/python/samples/getting_started/evaluation/self_reflection/.env.example
@@ -1,2 +1,3 @@
 AZURE_OPENAI_ENDPOINT="..."
 AZURE_OPENAI_API_KEY="..."
+AZURE_AI_PROJECT_ENDPOINT="https://<your-resource>.services.ai.azure.com/api/projects/<your-project>/"
diff --git a/python/samples/getting_started/evaluation/self_reflection/README.md b/python/samples/getting_started/evaluation/self_reflection/README.md
index a6ab419b0d..c75aa62ce8 100644
--- a/python/samples/getting_started/evaluation/self_reflection/README.md
+++ b/python/samples/getting_started/evaluation/self_reflection/README.md
@@ -6,7 +6,7 @@ This sample demonstrates the self-reflection pattern using Agent Framework and A
 
 **What it demonstrates:**
 - Iterative self-reflection loop that automatically improves responses based on groundedness evaluation
-- Batch processing of prompts from Parquet files with progress tracking
+- Batch processing of prompts from JSONL files with progress tracking
 - Using `AzureOpenAIChatClient` with Azure CLI authentication
 - Comprehensive summary statistics and detailed result tracking
 
@@ -18,14 +18,13 @@ This sample demonstrates the self-reflection pattern using Agent Framework and A
 ### Python Environment
 
 ```bash
-pip install agent-framework-core azure-ai-evaluation pandas --pre
+pip install agent-framework-core azure-ai-projects pandas --pre
 ```
 
 ### Environment Variables
 
 ```bash
 # .env file
-AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
-AZURE_OPENAI_API_KEY=your-api-key # Optional with Azure CLI
+AZURE_AI_PROJECT_ENDPOINT=https://<your-resource>.services.ai.azure.com/api/projects/<your-project>/
 ```
 
 ## Running the Sample
@@ -35,15 +34,15 @@ AZURE_OPENAI_API_KEY=your-api-key # Optional with Azure CLI
 python self_reflection.py
 
 # With options
-python self_reflection.py --input my_prompts.parquet \
-    --output results.parquet \
+python self_reflection.py --input my_prompts.jsonl \
+    --output results.jsonl \
     --max-reflections 5 \
     -n 10
 ```
 
 **CLI Options:**
-- `--input`, `-i`: Input parquet file
-- `--output`, `-o`: Output parquet file
+- `--input`, `-i`: Input JSONL file
+- `--output`, `-o`: Output JSONL file
 - `--agent-model`, `-m`: Agent model name (default: gpt-4.1)
 - `--judge-model`, `-e`: Evaluator model name (default: gpt-4.1)
 - `--max-reflections`: Max iterations (default: 3)
diff --git a/python/samples/getting_started/evaluation/self_reflection/self_reflection.py b/python/samples/getting_started/evaluation/self_reflection/self_reflection.py
index 76ed8d6e65..b5b1d1131a 100644
--- a/python/samples/getting_started/evaluation/self_reflection/self_reflection.py
+++ b/python/samples/getting_started/evaluation/self_reflection/self_reflection.py
@@ -5,13 +5,20 @@
 import time
 import argparse
 import pandas as pd
+import openai
 from typing import Any
 from dotenv import load_dotenv
+from openai.types.eval_create_params import DataSourceConfigCustom
+from openai.types.evals.create_eval_jsonl_run_data_source_param import (
+    CreateEvalJSONLRunDataSourceParam,
+    SourceFileContent,
+    SourceFileContentContent,
+)
 from agent_framework import ChatAgent, ChatMessage
 from agent_framework.azure import AzureOpenAIChatClient
+from azure.ai.projects import AIProjectClient
 from azure.identity import AzureCliCredential
-from azure.ai.evaluation import GroundednessEvaluator, AzureOpenAIModelConfiguration
""" Self-Reflection LLM Runner @@ -41,30 +48,96 @@ DEFAULT_JUDGE_MODEL = "gpt-4.1" -def create_groundedness_evaluator(judge_model: str) -> GroundednessEvaluator: - """ - Create a groundedness evaluator. +def create_openai_client(): + endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] + credential = AzureCliCredential() + project_client = AIProjectClient(endpoint=endpoint, credential=credential) + return project_client.get_openai_client() + + +def create_eval(client: openai.OpenAI, judge_model: str) -> openai.types.EvalCreateResponse: + print("Creating Eval") + data_source_config = DataSourceConfigCustom({ + "type": "custom", + "item_schema": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, + "context": {"type": "string"}, + }, + "required": [], + }, + "include_sample_schema": True, + }) + + testing_criteria = [{ + "type": "azure_ai_evaluator", + "name": "groundedness", + "evaluator_name": "builtin.groundedness", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}", "context": "{{item.context}}"}, + "initialization_parameters": {"deployment_name": f"{judge_model}"}, + }] + + return client.evals.create( + name="Eval", + data_source_config=data_source_config, + testing_criteria=testing_criteria, # type: ignore + ) - Args: - judge_model: Model deployment name for evaluation - Returns: - Configured GroundednessEvaluator - """ - judge_model_config = AzureOpenAIModelConfiguration( - azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"), - api_key=os.environ.get("AZURE_OPENAI_API_KEY"), - api_version="2024-12-01-preview", - azure_deployment=judge_model, + +def run_eval( + client: openai.OpenAI, + eval_object: openai.types.EvalCreateResponse, + query: str, + response: str, + context: str, +): + eval_run_object = client.evals.runs.create( + eval_id=eval_object.id, + name="inline_data_run", + metadata={"team": "eval-exp", "scenario": "inline-data-v1"}, + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={ + "query": query, + "context": context, + "response": response, + } + ), + ], + ), + ), ) - return GroundednessEvaluator(model_config=judge_model_config) + + eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) + + MAX_RETRY = 10 + for _ in range(0, MAX_RETRY): + run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) + if run.status == "failed": + print(f"Eval run failed. 
Run ID: {run.id}, Status: {run.status}, Error: {getattr(run, 'error', 'Unknown error')}") + continue + elif run.status == "completed": + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + return output_items + time.sleep(5) + + print("Eval result retrieval timeout.") + return None async def execute_query_with_self_reflection( *, + client: openai.OpenAI, agent: ChatAgent, + eval_object: openai.types.EvalCreateResponse, full_user_query: str, context: str, - evaluator: GroundednessEvaluator, max_self_reflections: int = 3, ) -> dict[str, Any]: """ @@ -108,17 +181,20 @@ async def execute_query_with_self_reflection( # Evaluate groundedness start_time_eval = time.time() - groundedness_res = evaluator( + eval_run_output_items = run_eval( + client=client, + eval_object=eval_object, query=full_user_query, response=agent_response, - context=context + context=context, ) + if eval_run_output_items is None: + print(f" ⚠️ Groundedness evaluation failed (timeout or error) for iteration {i+1}.") + continue + score = eval_run_output_items[0].results[0].score end_time_eval = time.time() total_groundedness_eval_time += (end_time_eval - start_time_eval) - feedback = groundedness_res['groundedness_reason'] - score = int(groundedness_res['groundedness']) - # Store score in structured format iteration_scores.append(score) @@ -144,11 +220,7 @@ async def execute_query_with_self_reflection( # Request improvement reflection_prompt = ( f"The groundedness score of your response is {score}/{max_score}. " - f"Explanation for score: [{feedback}]. " f"Reflect on your answer and improve it to get the maximum score of {max_score} " - f"considering the explanation. Now please provide an updated response, taking into " - f"account the feedback, but make your answer sound as if it was your first response. " - f"Don't refer to the feedback in your answer." ) messages.append(ChatMessage(role="user", text=reflection_prompt)) @@ -226,10 +298,11 @@ async def run_self_reflection_batch( # Configure clients print(f"Configuring Azure OpenAI client...") - - print(f"Creating groundedness evaluator with model: {judge_model}") - evaluator = create_groundedness_evaluator(judge_model) - + client = create_openai_client() + + # Create Eval + eval_object = create_eval(client=client, judge_model=judge_model) + # Process each prompt print(f"Max self-reflections: {max_self_reflections}\n") @@ -239,10 +312,11 @@ async def run_self_reflection_batch( try: result = await execute_query_with_self_reflection( + client=client, agent=agent, + eval_object=eval_object, full_user_query=row['full_prompt'], context=row['context_document'], - evaluator=evaluator, max_self_reflections=max_self_reflections, )
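
Note on the new input format: the sketch below illustrates the JSONL prompt file this change expects. The column names `full_prompt` and `context_document` are taken from `self_reflection.py` above, and the file name matches the README example; the exact `pandas` loading call is an assumption, since the loading code is outside this diff.

```python
# Minimal sketch (not part of this PR): writing and reading a JSONL prompt file
# with the two columns the sample references (full_prompt, context_document).
import json

import pandas as pd

records = [
    {
        "full_prompt": "Using only the provided context, summarize the release notes in two sentences.",
        "context_document": "The release notes state that version 1.0 added streaming support and a new evaluation API.",
    },
]

# JSONL: one JSON object per line.
with open("my_prompts.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

# Presumed loading step, mirroring the row['full_prompt'] / row['context_document']
# access in run_self_reflection_batch:
df = pd.read_json("my_prompts.jsonl", lines=True)
for _, row in df.iterrows():
    print(row["full_prompt"], len(row["context_document"]))
```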