diff --git a/.env.example b/.env.example index e6b29a8..696b59c 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,6 @@ -# OpenAI-compatible LLM (using Gemini via OpenAI API) OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai/" OPENAI_API_KEY="..." # Or use GEMINI_API_KEY or GOOGLE_API_KEY +GEMINI_API_KEY="..." # Or use GOOGLE_API_KEY # Model selection (see https://ai.google.dev/gemini-api/docs/models) # Stable: gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite diff --git a/aieng-eval-agents/aieng/agent_evals/async_client_manager.py b/aieng-eval-agents/aieng/agent_evals/async_client_manager.py index dadf669..b80b015 100644 --- a/aieng-eval-agents/aieng/agent_evals/async_client_manager.py +++ b/aieng-eval-agents/aieng/agent_evals/async_client_manager.py @@ -4,6 +4,7 @@ like OpenAI to prevent event loop conflicts during Gradio's hot-reload process. """ +import logging import sqlite3 from pathlib import Path from typing import Any @@ -13,6 +14,10 @@ from openai import AsyncOpenAI +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + class SQLiteConnection: """SQLite connection.""" @@ -27,7 +32,7 @@ def __init__(self, db_path: Path) -> None: self.db_path = db_path self.connection = sqlite3.connect(db_path) - def execute(self, query: str) -> list[Any]: + def execute(self, query: str) -> list[Any] | str: """Execute a SQLite query. Parameters @@ -37,11 +42,16 @@ def execute(self, query: str) -> list[Any]: Returns ------- - list[Any] + list[Any] | str The result of the query. Will return the result of `execute(query).fetchall()`. + Returns a string with an error message if the query fails. 
""" - return self.connection.execute(query).fetchall() + try: + return self.connection.execute(query).fetchall() + except Exception as e: + logger.exception(f"Error executing query: {e}") + return [str(e)] def close(self) -> None: """Close the SQLite connection.""" diff --git a/aieng-eval-agents/aieng/agent_evals/configs.py b/aieng-eval-agents/aieng/agent_evals/configs.py index ce8d0ec..9255cef 100644 --- a/aieng-eval-agents/aieng/agent_evals/configs.py +++ b/aieng-eval-agents/aieng/agent_evals/configs.py @@ -91,6 +91,11 @@ class Configs(BaseSettings): validation_alias=AliasChoices("OPENAI_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY"), description="API key for OpenAI-compatible API (accepts OPENAI_API_KEY, GEMINI_API_KEY, or GOOGLE_API_KEY).", ) + gemini_api_key: SecretStr = Field( + default=SecretStr("default-gemini-api-key"), # setting a default so some implementations can run without it + validation_alias=AliasChoices("GEMINI_API_KEY", "GOOGLE_API_KEY"), + description="API key for Google/Gemini API (accepts GEMINI_API_KEY, or GOOGLE_API_KEY).", + ) default_planner_model: str = Field( default="gemini-2.5-pro", description="Model name for planning/complex reasoning tasks.", diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py b/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py index 34428dd..4fa543f 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/trace.py @@ -349,6 +349,7 @@ def _sum_token_usage(observations: list[ObservationsView], *, token_type: str) - usage_keys = _usage_keys_for_token_type(token_type) for observation in observations: + assert observation.usage_details is not None, "Usage details must be present" usage_details = observation.usage_details for key in usage_keys: value = usage_details.get(key) diff --git a/aieng-eval-agents/aieng/agent_evals/langfuse.py b/aieng-eval-agents/aieng/agent_evals/langfuse.py index 8102ae3..dc0b831 100644 --- 
a/aieng-eval-agents/aieng/agent_evals/langfuse.py +++ b/aieng-eval-agents/aieng/agent_evals/langfuse.py @@ -8,8 +8,6 @@ from pathlib import Path from typing import Any, Literal -import logfire -import nest_asyncio from aieng.agent_evals.async_client_manager import AsyncClientManager from aieng.agent_evals.configs import Configs from aieng.agent_evals.progress import track_with_progress @@ -24,19 +22,6 @@ logger = logging.getLogger(__name__) -def configure_oai_agents_sdk(service_name: str) -> None: - """Register Langfuse as tracing provider for OAI Agents SDK. - - Parameters - ---------- - service_name : str - The name of the service to configure. - """ - nest_asyncio.apply() - logfire.configure(service_name=service_name, send_to_logfire=False, scrubbing=False) - logfire.instrument_openai_agents() - - def set_up_langfuse_otlp_env_vars(): """Set up environment variables for Langfuse OpenTelemetry integration. @@ -71,7 +56,6 @@ def setup_langfuse_tracer(service_name: str = "aieng-eval-agents") -> "trace.Tra tracer: OpenTelemetry Tracer """ set_up_langfuse_otlp_env_vars() - configure_oai_agents_sdk(service_name) # Create a TracerProvider for OpenTelemetry trace_provider = TracerProvider() diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py b/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py index c7fa14c..44d11a9 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py @@ -13,12 +13,21 @@ >>> ) """ +import logging +from enum import Enum from pathlib import Path +from typing import Any -import agents from aieng.agent_evals.async_client_manager import AsyncClientManager from aieng.agent_evals.langfuse import setup_langfuse_tracer from aieng.agent_evals.report_generation.file_writer import ReportFileWriter +from google.adk.agents import Agent +from google.adk.events.event import Event +from pydantic import BaseModel + + 
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) def get_report_generation_agent( @@ -26,7 +35,7 @@ def get_report_generation_agent( sqlite_db_path: Path, reports_output_path: Path, langfuse_project_name: str | None, -) -> agents.Agent: +) -> Agent: """ Define the report generation agent. @@ -54,29 +63,156 @@ def get_report_generation_agent( client_manager = AsyncClientManager.get_instance() report_file_writer = ReportFileWriter(reports_output_path) - # Define an agent using the OpenAI Agent SDK - return agents.Agent( - name="Report Generation Agent", # Agent name for logging and debugging purposes - instructions=instructions, # System instructions for the agent - # Tools available to the agent - # We wrap the `execute_sql_query` and `write_report_to_file` methods - # with `function_tool`, which will construct the tool definition JSON - # schema by extracting the necessary information from the method - # signature and docstring. 
class EventType(Enum):
    """Kinds of events an ADK agent can emit."""

    FINAL_RESPONSE = "final_response"  # the final message shown to the user
    TOOL_CALL = "tool_call"  # the agent invoking a tool
    THOUGHT = "thought"  # intermediate model reasoning text
    TOOL_RESPONSE = "tool_response"  # a tool's reply fed back to the agent


class ParsedEvent(BaseModel):
    """A simplified, flat view of a single agent event.

    Produced by ``EventParser`` so that UIs and evaluators do not need to
    understand the raw ADK event structure.
    """

    # Which kind of event this is.
    type: EventType
    # Main payload: final/thought text, or the tool name for (tool) calls.
    text: str
    # Tool-call arguments or tool-response payload, when applicable.
    arguments: Any | None = None
+ """ + parsed_events = [] + + if event.is_final_response(): + parsed_events.extend(cls._parse_final_response(event)) + + elif event.content: + if event.content.role == "model": + parsed_events.extend(cls._parse_model_response(event)) + + elif event.content.role == "user": + parsed_events.extend(cls._parse_user_response(event)) + + else: + logger.warning(f"Unknown content role '{event.content.role}': {event}") + + else: + logger.warning(f"Unknown stream event: {event}") + + return parsed_events + + @classmethod + def _parse_final_response(cls, event: Event) -> list[ParsedEvent]: + if ( + not event.content + or not event.content.parts + or len(event.content.parts) == 0 + or not event.content.parts[0].text + ): + logger.warning(f"Final response's content is not valid: {event}") + return [] + + return [ + ParsedEvent( + type=EventType.FINAL_RESPONSE, + text=event.content.parts[0].text, + ) + ] + + @classmethod + def _parse_model_response(cls, event: Event) -> list[ParsedEvent]: + if not event.content or not event.content.parts: + logger.warning(f"Model response's content is not valid: {event}") + return [] + + parsed_events = [] + + for part in event.content.parts: + # Parsing tool calls and their arguments + if part.function_call: + if not part.function_call.name: + logger.warning(f"No name in function call: {part}") + continue + + parsed_events.append( + ParsedEvent( + type=EventType.TOOL_CALL, + text=part.function_call.name, + arguments=part.function_call.args, + ) + ) + + # Parsing the agent's thoughts + elif part.thought_signature or (part.text and not part.thought_signature): + if not part.text: + logger.warning(f"No text in part: {part}") + continue + + parsed_events.append( + ParsedEvent( + type=EventType.THOUGHT, + text=part.text, + ) + ) + + else: + logger.warning(f"Unknown part type: {part}") + + return parsed_events + + @classmethod + def _parse_user_response(cls, event: Event) -> list[ParsedEvent]: + if not event.content or not event.content.parts: + 
logger.warning(f"Model response's content is not valid: {event}") + return [] + + parsed_events = [] + + for part in event.content.parts: + if part.function_response: + if not part.function_response.name: + logger.warning(f"No name in function response: {part}") + continue + + parsed_events.append( + ParsedEvent( + type=EventType.TOOL_RESPONSE, + text=part.function_response.name, + arguments=part.function_response.response, + ) + ) + else: + logger.warning(f"Unknown part type: {part}") + + return parsed_events diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation.py b/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation.py index c4101d3..7ee6807 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation.py @@ -16,9 +16,8 @@ from pathlib import Path from typing import Any -import agents from aieng.agent_evals.async_client_manager import AsyncClientManager -from aieng.agent_evals.report_generation.agent import get_report_generation_agent +from aieng.agent_evals.report_generation.agent import EventParser, EventType, get_report_generation_agent from aieng.agent_evals.report_generation.prompts import ( MAIN_AGENT_INSTRUCTIONS, RESULT_EVALUATOR_INSTRUCTIONS, @@ -26,12 +25,14 @@ TRAJECTORY_EVALUATOR_INSTRUCTIONS, TRAJECTORY_EVALUATOR_TEMPLATE, ) +from google.adk.agents import Agent +from google.adk.events.event import Event +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.genai import Client +from google.genai.types import Content, GenerateContentConfig, Part from langfuse._client.datasets import DatasetItemClient from langfuse.experiment import Evaluation, LocalExperimentItem -from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall -from openai.types.responses.response_output_message import ResponseOutputMessage -from 
openai.types.responses.response_output_refusal import ResponseOutputRefusal -from openai.types.responses.response_output_text import ResponseOutputText from pydantic import BaseModel from tenacity import retry, stop_after_attempt, wait_exponential @@ -62,7 +63,7 @@ async def evaluate( sqlite_db_path: Path, reports_output_path: Path, langfuse_project_name: str, -): +) -> None: """Evaluate the report generation agent against a Langfuse dataset. Parameters @@ -159,38 +160,41 @@ async def run(self, *, item: LocalExperimentItem | DatasetItemClient, **kwargs: ) # Handle both TypedDict and class access patterns item_input = item["input"] if isinstance(item, dict) else item.input - result = await run_agent_with_retry(report_generation_agent, item_input) + events = await run_agent_with_retry(report_generation_agent, item_input) # Extract the report data and trajectory from the agent's response - actions = [] - parameters = [] - final_report = None - for raw_response in result.raw_responses: - for output in raw_response.output: - # The trajectory will be the list of actions and the - # parameters passed to each one of them - if isinstance(output, ResponseFunctionToolCall): - actions.append(output.name) - parameters.append(output.arguments) + actions: list[str] = [] + parameters: list[Any | None] = [] + final_report: str | None = None + + # The trajectory will be the list of actions and the + # parameters passed to each one of them + for event in events: + parsed_events = EventParser.parse(event) + + for parsed_event in parsed_events: + if parsed_event.type == EventType.FINAL_RESPONSE: + # Picking up the final message displayed to the user + actions.append("final_response") + parameters.append(parsed_event.text) + + if parsed_event.type == EventType.TOOL_CALL: + # Picking up tool calls and their arguments + actions.append(parsed_event.text) + parameters.append(parsed_event.arguments) # The final report will be the arguments sent by the - # write_report_to_file function call - 
# If there is more than one call to the write_report_to_file - # function, the last one will be used because the previous - # calls were likely be failed calls - if isinstance(output, ResponseFunctionToolCall) and "write_report_to_file" in output.name: - final_report = output.arguments - - if isinstance(output, ResponseOutputMessage): - for content in output.content: - actions.append(content.type) - if isinstance(content, ResponseOutputText): - parameters.append(content.text) - elif isinstance(content, ResponseOutputRefusal): - parameters.append(content.refusal) + # write_xlsx tool call + # If there is more than one call to the write_xlsx tool call, + # the last one will be used because the previous + # calls are likely failed calls + if parsed_event.text == "write_xlsx": + final_report = parsed_event.arguments + + # Not tracking EventType.THOUGHT or EventType.TOOL_RESPONSE if final_report is None: - logger.warning("No call to write_report_to_file function found in the agent's response") + logger.warning("No call to `write_xlsx` function found in the agent's response") return { "final_report": final_report, @@ -232,30 +236,37 @@ async def final_result_evaluator( """ # Define the evaluator agent client_manager = AsyncClientManager.get_instance() - evaluator_agent = agents.Agent( - name="Final Result Evaluator Agent", - instructions=RESULT_EVALUATOR_INSTRUCTIONS, - output_type=EvaluatorResponse, - model=agents.OpenAIChatCompletionsModel( - model=client_manager.configs.default_planner_model, - openai_client=client_manager.openai_client, - ), - ) + # Format the input for the evaluator agent evaluator_input = RESULT_EVALUATOR_TEMPLATE.format( question=input, ground_truth=expected_output["final_report"], proposed_response=output["final_report"], ) - # Run the evaluator agent with retry - result = await run_agent_with_retry(evaluator_agent, evaluator_input) - evaluation_response = result.final_output_as(EvaluatorResponse) - # Return the evaluation result + # Get the 
additional evaluation instructions if it + # exists for this specific sample + additional_instructions = _get_additional_instructions(expected_output, "final_report") + + client = Client() + response = client.models.generate_content( + model=client_manager.configs.default_worker_model, + contents=evaluator_input, + config=GenerateContentConfig( + system_instruction=RESULT_EVALUATOR_INSTRUCTIONS + additional_instructions, + response_mime_type="application/json", + response_schema=EvaluatorResponse.model_json_schema(), + ), + ) + + # Parsing and returning the evaluation result + assert isinstance(response.parsed, dict), f"response.parsed must be a dictionary: {response.parsed}" + evaluator_response = EvaluatorResponse(**response.parsed) + return Evaluation( name="Final Result", - value=evaluation_response.is_answer_correct, - comment=evaluation_response.explanation, + value=evaluator_response.is_answer_correct, + comment=evaluator_response.explanation, ) @@ -290,15 +301,6 @@ async def trajectory_evaluator( """ # Define the evaluator agent client_manager = AsyncClientManager.get_instance() - evaluator_agent = agents.Agent( - name="Trajectory Evaluator Agent", - instructions=TRAJECTORY_EVALUATOR_INSTRUCTIONS, - output_type=EvaluatorResponse, - model=agents.OpenAIChatCompletionsModel( - model=client_manager.configs.default_planner_model, - openai_client=client_manager.openai_client, - ), - ) assert isinstance(expected_output["trajectory"], dict), "Expected trajectory must be a dictionary" assert isinstance(output["trajectory"], dict), "Actual trajectory must be a dictionary" @@ -311,20 +313,43 @@ async def trajectory_evaluator( actual_actions=output["trajectory"]["actions"], actual_parameters=output["trajectory"]["parameters"], ) - # Run the evaluator agent with retry - result = await run_agent_with_retry(evaluator_agent, evaluator_input) - evaluation_response = result.final_output_as(EvaluatorResponse) - # Return the evaluation result + # Get the additional evaluation 
instructions if it + # exists for this specific sample + additional_instructions = _get_additional_instructions(expected_output, "trajectory") + + client = Client() + response = client.models.generate_content( + model=client_manager.configs.default_worker_model, + contents=evaluator_input, + config=GenerateContentConfig( + system_instruction=TRAJECTORY_EVALUATOR_INSTRUCTIONS + additional_instructions, + response_mime_type="application/json", + response_schema=EvaluatorResponse.model_json_schema(), + ), + ) + + # Parsing and returning the evaluation result + assert isinstance(response.parsed, dict), f"response.parsed must be a dictionary: {response.parsed}" + evaluator_response = EvaluatorResponse(**response.parsed) + return Evaluation( name="Trajectory", - value=evaluation_response.is_answer_correct, - comment=evaluation_response.explanation, + value=evaluator_response.is_answer_correct, + comment=evaluator_response.explanation, ) +def _get_additional_instructions(expected_output: EvaluationOutput, key: str) -> str: + additional_instructions_dict = expected_output.get("additional_instructions", {}) + if additional_instructions_dict: + return additional_instructions_dict.get(key, "") + + return "" + + @retry(stop=stop_after_attempt(5), wait=wait_exponential()) -async def run_agent_with_retry(agent: agents.Agent, agent_input: str) -> agents.RunResult: +async def run_agent_with_retry(agent: Agent, agent_input: str) -> list[Event]: """Run an agent with Tenacity's retry mechanism. Parameters @@ -336,8 +361,28 @@ async def run_agent_with_retry(agent: agents.Agent, agent_input: str) -> agents. Returns ------- - agents.RunnerResult - The result of the agent run. + list[Event] + The events from the agent run. 
""" logger.info(f"Running agent {agent.name} with input '{agent_input[:100]}...'") - return await agents.Runner.run(agent, input=agent_input) + + # Create session and runner + session_service = InMemorySessionService() + runner = Runner(app_name=agent.name, agent=agent, session_service=session_service) + current_session = await session_service.create_session( + app_name=agent.name, + user_id="user", + state={}, + ) + + # create the user message and run the agent + content = Content(role="user", parts=[Part(text=agent_input)]) + events = [] + async for event in runner.run_async( + user_id="user", + session_id=current_session.id, + new_message=content, + ): + events.append(event) + + return events diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py b/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py index 60dc09e..8c33585 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py @@ -11,6 +11,7 @@ ... ) """ +import logging import urllib.parse from pathlib import Path from typing import Any @@ -18,8 +19,12 @@ import pandas as pd +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + class ReportFileWriter: - """Write reports to an XLSX file.""" + """Write reports to a file.""" def __init__(self, reports_output_path: Path): """Initialize the report writer. @@ -31,7 +36,7 @@ def __init__(self, reports_output_path: Path): """ self.reports_output_path = reports_output_path - def write( + def write_xlsx( self, report_data: list[Any], report_columns: list[str], @@ -57,16 +62,22 @@ def write( str The path to the report file. If `gradio_link` is True, will return a URL link that allows Gradio UI to download the file. + Returns a string with an error message if the report fails to write. 
""" - # Create reports directory if it doesn't exist - self.reports_output_path.mkdir(exist_ok=True) - filepath = self.reports_output_path / filename + try: + # Create reports directory if it doesn't exist + self.reports_output_path.mkdir(exist_ok=True) + filepath = self.reports_output_path / filename + + report_df = pd.DataFrame(report_data, columns=report_columns) + report_df.to_excel(filepath, index=False) - report_df = pd.DataFrame(report_data, columns=report_columns) - report_df.to_excel(filepath, index=False) + file_uri = str(filepath) + if gradio_link: + file_uri = f"gradio_api/file={urllib.parse.quote(str(file_uri), safe='')}" - file_uri = str(filepath) - if gradio_link: - file_uri = f"gradio_api/file={urllib.parse.quote(str(file_uri), safe='')}" + return file_uri - return file_uri + except Exception as e: + logger.exception(f"Error writing report: {e}") + return str(e) diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/prompts.py b/aieng-eval-agents/aieng/agent_evals/report_generation/prompts.py index 4a8ca19..4ba2b74 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/prompts.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/prompts.py @@ -7,7 +7,7 @@ For best performance, divide complex queries into simpler sub-queries. \ Do not make up information. \ When the report is done, use the report file writer tool to write it to a file. \ -Make sure the write_report_to_file tool is called so it generates the report file. \ +Make sure the "write_xlsx" tool is called so it generates the report file. \ At the end, provide the report file as a downloadable hyperlink to the user. \ Make sure the link can be clicked on by the user. 
""" @@ -25,7 +25,7 @@ - A list of parameters that has been passed to each one of the actions\ It's OK if the agent makes mistakes and performs additional steps, or if the queries do not exactly match\ the description, as long as the queries performed end up satisfying the "Question".\ -It is important that the last action to be of type "output_text" and that itproduces a link to the report file. +It is important that the last action to be of type "final_response" and that it produces a link to the report file. """ TRAJECTORY_EVALUATOR_TEMPLATE = """\ @@ -45,13 +45,17 @@ """ RESULT_EVALUATOR_INSTRUCTIONS = """\ -Evaluate whether the "Proposed Answer" to the given "Question" matches the "Ground Truth".\ -Disregard the following aspects when comparing the "Proposed Answer" to the "Ground Truth":\ -- The order of the items should not matter, unless explicitly specified in the "Question".\ -- The formatting of the values should not matter, unless explicitly specified in the "Question".\ -- The column and row names have to be similar but not necessarily exact, unless explicitly specified in the "Question".\ -- The filename has to be similar by name but not necessarily exact, unless explicitly specified in the "Question".\ -- The numerical values should be equal to the second decimal place. +Evaluate whether the "Proposed Answer" to the given "Question" matches the "Ground Truth". \ +Disregard the following aspects when comparing the "Proposed Answer" to the "Ground Truth": \ +- The order of the items should not matter, unless explicitly specified in the "Question". \ +- The formatting of the values should not matter, unless explicitly specified in the "Question". \ +- The column and row names have to be similar but not necessarily exact, unless explicitly specified in the "Question". \ +- The filename has to be similar by name but not necessarily exact, unless explicitly specified in the "Question". \ +- It is ok if the filename is missing. 
\ +- The numerical values should be equal with a tolerance of 0.01. \ +- The report data in the "Proposed Answer" should have the same number of rows as in the "Ground Truth". \ +- It is OK if the report data in the "Proposed Answer" contains extra columns or if the rows are in a different order, \ +unless explicitly specified in the "Question". """ RESULT_EVALUATOR_TEMPLATE = """\ diff --git a/aieng-eval-agents/aieng/agent_evals/utils.py b/aieng-eval-agents/aieng/agent_evals/utils.py deleted file mode 100644 index 6138ece..0000000 --- a/aieng-eval-agents/aieng/agent_evals/utils.py +++ /dev/null @@ -1,110 +0,0 @@ -"""Utility functions for the report generation agent.""" - -import uuid -from typing import Any - -from agents import SQLiteSession, StreamEvent, stream_events -from agents.items import ToolCallOutputItem -from gradio.components.chatbot import ChatMessage, MetadataDict -from openai.types.responses import ResponseFunctionToolCall, ResponseOutputText -from openai.types.responses.response_completed_event import ResponseCompletedEvent -from openai.types.responses.response_output_message import ResponseOutputMessage - - -def oai_agent_stream_to_gradio_messages(stream_event: StreamEvent) -> list[ChatMessage]: - """Parse agent sdk "stream event" into a list of gr messages. - - Adds extra data for tool use to make the gradio display informative. - - Parameters - ---------- - stream_event : StreamEvent - The stream event from the agent SDK. - - Returns - ------- - list[ChatMessage] - A list of Gradio chat messages parsed from the stream event. - """ - output: list[ChatMessage] = [] - - if isinstance(stream_event, stream_events.RawResponsesStreamEvent): - data = stream_event.data - if isinstance(data, ResponseCompletedEvent): - # The completed event may contain multiple output messages, - # including tool calls and final outputs. - # If there is at least one tool call, we mark the response as a thought. 
- is_thought = len(data.response.output) > 1 and any( - isinstance(message, ResponseFunctionToolCall) for message in data.response.output - ) - - for message in data.response.output: - if isinstance(message, ResponseOutputMessage): - for _item in message.content: - if isinstance(_item, ResponseOutputText): - output.append( - ChatMessage( - role="assistant", - content=_item.text, - metadata={ - "title": "🧠 Thought", - "id": data.sequence_number, - } - if is_thought - else MetadataDict(), - ) - ) - elif isinstance(message, ResponseFunctionToolCall): - output.append( - ChatMessage( - role="assistant", - content=f"```\n{message.arguments}\n```", - metadata={ - "title": f"🛠️ Used tool `{message.name}`", - }, - ) - ) - - elif isinstance(stream_event, stream_events.RunItemStreamEvent): - name = stream_event.name - item = stream_event.item - - if name == "tool_output" and isinstance(item, ToolCallOutputItem): - output.append( - ChatMessage( - role="assistant", - content=f"```\n{item.output}\n```", - metadata={ - "title": "*Tool call output*", - "status": "done", # This makes it collapsed by default - }, - ) - ) - - return output - - -def get_or_create_session( - history: list[ChatMessage], - session_state: dict[str, Any], -) -> SQLiteSession: - """Get existing session or create a new one for conversation persistence. - - Parameters - ---------- - history : list[ChatMessage] - The history of the conversation. - session_state : dict[str, Any] - The state of the session. - - Returns - ------- - SQLiteSession - The session instance. 
- """ - if len(history) == 0: - session = SQLiteSession(session_id=str(uuid.uuid4())) - session_state["session"] = session - else: - session = session_state["session"] - return session diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py index 1e5b5b7..638134b 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py @@ -18,7 +18,7 @@ class TestKnowledgeGroundedAgent: def mock_config(self): """Create a mock config for testing.""" config = MagicMock() - config.openai_api_key = "test-api-key" + config.gemini_api_key = "test-api-key" config.default_worker_model = "gemini-2.5-flash" return config diff --git a/implementations/knowledge_qa/01_grounding_basics.ipynb b/implementations/knowledge_qa/01_grounding_basics.ipynb index 48c9b23..0401bb2 100644 --- a/implementations/knowledge_qa/01_grounding_basics.ipynb +++ b/implementations/knowledge_qa/01_grounding_basics.ipynb @@ -106,7 +106,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "config = KnowledgeAgentConfig()\nclient = genai.Client(api_key=config.openai_api_key)\n\n# This question requires very recent information (Jan 2026)\n# The non-grounded model will fail since its training data doesn't include this event\nquestion = \"Which day had the highest recorded snowfall in a single day in Toronto?\"\nexpected_answer = \"January 25, 2026\"\n\nconsole.print(f\"\\n[bold]Question:[/bold] {question}\")\nconsole.print(f\"[dim]Expected Answer: {expected_answer}[/dim]\\n\")\n\n# Without grounding - model relies on training data (cutoff before Jan 2026)\nconsole.print(\"[dim]Generating without grounding...[/dim]\")\nresponse_no_grounding = client.models.generate_content(\n model=config.default_worker_model,\n contents=question,\n)\n\n# With grounding - agent uses Google Search 
tool\nconsole.print(\"[dim]Generating with grounding (ADK agent)...[/dim]\")\nresponse_grounded = await agent.answer_async(question)\n\n# Side-by-side comparison using our display utility\ndisplay_comparison(response_no_grounding.text, response_grounded, console=console)\n\n# Show tool calls from the grounded response\nif response_grounded.tool_calls:\n console.print(\"\\n[bold cyan]🔧 Tool Calls (Grounded):[/bold cyan]\")\n for tc in response_grounded.tool_calls:\n console.print(f\" • {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n\n# Check if the grounded response contains the correct answer\nif expected_answer.lower() in response_grounded.text.lower() or \"january 25\" in response_grounded.text.lower():\n console.print(\"\\n[green]✓ Grounded response contains the correct answer![/green]\")\nelse:\n console.print(\"\\n[yellow]⚠ Check the grounded response for accuracy[/yellow]\")" + "source": "config = KnowledgeAgentConfig()\nclient = genai.Client(api_key=config.gemini_api_key)\n\n# This question requires very recent information (Jan 2026)\n# The non-grounded model will fail since its training data doesn't include this event\nquestion = \"Which day had the highest recorded snowfall in a single day in Toronto?\"\nexpected_answer = \"January 25, 2026\"\n\nconsole.print(f\"\\n[bold]Question:[/bold] {question}\")\nconsole.print(f\"[dim]Expected Answer: {expected_answer}[/dim]\\n\")\n\n# Without grounding - model relies on training data (cutoff before Jan 2026)\nconsole.print(\"[dim]Generating without grounding...[/dim]\")\nresponse_no_grounding = client.models.generate_content(\n model=config.default_worker_model,\n contents=question,\n)\n\n# With grounding - agent uses Google Search tool\nconsole.print(\"[dim]Generating with grounding (ADK agent)...[/dim]\")\nresponse_grounded = await agent.answer_async(question)\n\n# Side-by-side comparison using our display utility\ndisplay_comparison(response_no_grounding.text, response_grounded, console=console)\n\n# Show 
tool calls from the grounded response\nif response_grounded.tool_calls:\n console.print(\"\\n[bold cyan]🔧 Tool Calls (Grounded):[/bold cyan]\")\n for tc in response_grounded.tool_calls:\n console.print(f\" • {tc.get('name', 'unknown')}: {tc.get('args', {})}\")\n\n# Check if the grounded response contains the correct answer\nif expected_answer.lower() in response_grounded.text.lower() or \"january 25\" in response_grounded.text.lower():\n console.print(\"\\n[green]✓ Grounded response contains the correct answer![/green]\")\nelse:\n console.print(\"\\n[yellow]⚠ Check the grounded response for accuracy[/yellow]\")" }, { "cell_type": "markdown", diff --git a/implementations/report_generation/data/OnlineRetailReportEval.json b/implementations/report_generation/data/OnlineRetailReportEval.json index 8446317..2212d6d 100644 --- a/implementations/report_generation/data/OnlineRetailReportEval.json +++ b/implementations/report_generation/data/OnlineRetailReportEval.json @@ -24,10 +24,10 @@ }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "write_report_to_file", + "execute", + "execute", + "execute", + "write_xlsx", "output_text" ], "description": [ @@ -61,10 +61,10 @@ }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "write_report_to_file", + "execute", + "execute", + "execute", + "write_xlsx", "output_text" ], "description": [ @@ -82,18 +82,18 @@ "expected_output": { "final_report": { "report_data": [ - ["2010-12", 369.6727640671277], - ["2011-01", 377.61312204990026], - ["2011-02", 355.50510349750186], - ["2011-03", 343.35029145728674], - ["2011-04", 281.8326405714284], - ["2011-05", 333.79488232579683], - ["2011-06", 343.32991554893266], - ["2011-07", 353.3714268672203], - ["2011-08", 392.5707360552048], - ["2011-09", 438.0101469072177], - ["2011-10", 405.8774336618668], - ["2011-11", 421.86327561327636], + ["2010-12", 369.85531851851886], + ["2011-01", 
379.4039701897034], + ["2011-02", 357.54676956209624], + ["2011-03", 344.5623197175999], + ["2011-04", 282.8022482798164], + ["2011-05", 334.56684088806736], + ["2011-06", 343.50055666004044], + ["2011-07", 353.5548059159318], + ["2011-08", 393.0227461139903], + ["2011-09", 438.1983764503665], + ["2011-10", 406.03135001896266], + ["2011-11", 422.2288417099951], ["2011-12", 427.25912315270944] ], "report_columns": ["SaleMonth", "AverageOrderValue"], @@ -101,10 +101,10 @@ }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "write_report_to_file", + "execute", + "execute", + "execute", + "write_xlsx", "output_text" ], "description": [ @@ -120,9 +120,12 @@ "id": "4", "input": "Generate a report with the month-over-month trends in sales. The report should include the monthly sales, the month-over-month change and the percentage change.", "expected_output": { + "additional_instructions": { + "final_report": "For the last two items of the first row, 0, None and \"\" are valid values." 
+ }, "final_report": { "report_data": [ - ["2010-12", 748957.02, 748957.02, 0], + ["2010-12", 748957.02, 0, 0], ["2011-01", 560000.26, -188956.76, -25.23], ["2011-02", 498062.65, -61937.61, -11.06], ["2011-03", 683267.08, 185204.43, 37.18], @@ -146,11 +149,11 @@ }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", + "execute", + "execute", + "execute", "output_text", - "write_report_to_file", + "write_xlsx", "output_text" ], "description": [ @@ -236,10 +239,10 @@ }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "write_report_to_file", + "execute", + "execute", + "execute", + "write_xlsx", "output_text" ], "description": [ @@ -273,11 +276,11 @@ }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "write_report_to_file", + "execute", + "execute", + "execute", + "execute", + "write_xlsx", "output_text" ], "description": [ @@ -296,19 +299,19 @@ "expected_output": { "final_report": { "report_data": [ - ["2010", 196.46396396396398, 789.2065542676543], - ["2011", 342.7430707154745, 2119.856516843317] + ["2010", 196.46396396396398, 585.03], + ["2011", 342.7430707154745, 1825.04] ], "report_columns": ["Year", "Average Spent by One-Time Buyers", "Average Spent by All Customers"], "filename": "average_spending_report.xlsx" }, "trajectory": { "actions": [ - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "execute_sql_query", - "write_report_to_file", + "execute", + "execute", + "execute", + "execute", + "write_xlsx", "output_text" ], "description": [ diff --git a/implementations/report_generation/demo.py b/implementations/report_generation/demo.py index 4cba494..5902a10 100644 --- a/implementations/report_generation/demo.py +++ b/implementations/report_generation/demo.py @@ -11,14 +11,15 @@ from functools import partial from typing import Any, AsyncGenerator -import agents import 
click import gradio as gr from aieng.agent_evals.async_client_manager import AsyncClientManager from aieng.agent_evals.report_generation.agent import get_report_generation_agent from aieng.agent_evals.report_generation.prompts import MAIN_AGENT_INSTRUCTIONS -from aieng.agent_evals.utils import get_or_create_session, oai_agent_stream_to_gradio_messages from dotenv import load_dotenv +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.genai.types import Content, Part from gradio.components.chatbot import ChatMessage from implementations.report_generation.env_vars import ( @@ -26,6 +27,7 @@ get_reports_output_path, get_sqlite_db_path, ) +from implementations.report_generation.gradio_utils import agent_event_to_gradio_messages load_dotenv(verbose=True) @@ -61,12 +63,6 @@ async def agent_session_handler( # Initialize list of chat messages for a single turn turn_messages: list[ChatMessage] = [] - # Construct an in-memory SQLite session for the agent to maintain - # conversation history across multiple turns of a chat - # This makes it possible to ask follow-up questions that refer to - # previous turns in the conversation - session = get_or_create_session(history, session_state) - main_agent = get_report_generation_agent( instructions=MAIN_AGENT_INSTRUCTIONS, sqlite_db_path=get_sqlite_db_path(), @@ -74,13 +70,30 @@ async def agent_session_handler( langfuse_project_name=get_langfuse_project_name() if enable_trace else None, ) - # Run the agent in streaming mode to get and display intermediate outputs - result_stream = agents.Runner.run_streamed(main_agent, input=query, session=session) + # Construct an in-memory session for the agent to maintain + # conversation history across multiple turns of a chat + # This makes it possible to ask follow-up questions that refer to + # previous turns in the conversation + session_service = InMemorySessionService() + runner = Runner(app_name=main_agent.name, agent=main_agent, 
session_service=session_service) + current_session = await session_service.create_session( + app_name=main_agent.name, + user_id="user", + state={}, + ) - async for _item in result_stream.stream_events(): + # create the user message + content = Content(role="user", parts=[Part(text=query)]) + + # Run the agent in streaming mode to get and display intermediate outputs + async for event in runner.run_async( + user_id="user", + session_id=current_session.id, + new_message=content, + ): # Parse the stream events, convert to Gradio chat messages and append to # the chat history - turn_messages += oai_agent_stream_to_gradio_messages(_item) + turn_messages += agent_event_to_gradio_messages(event) if len(turn_messages) > 0: yield turn_messages diff --git a/implementations/report_generation/gradio_utils.py b/implementations/report_generation/gradio_utils.py new file mode 100644 index 0000000..7aad3b9 --- /dev/null +++ b/implementations/report_generation/gradio_utils.py @@ -0,0 +1,70 @@ +"""Utility functions for the report generation agent.""" + +import logging + +from aieng.agent_evals.report_generation.agent import EventParser, EventType +from google.adk.events.event import Event +from gradio.components.chatbot import ChatMessage, MetadataDict + + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + +def agent_event_to_gradio_messages(event: Event) -> list[ChatMessage]: + """Parse a Google ADK Event into a list of gr messages. + + Adds extra data for tool use to make the gradio display informative. + + Parameters + ---------- + event : Event + The event from the Google ADK's agent response. + + Returns + ------- + list[ChatMessage] + A list of Gradio chat messages parsed from the stream event. 
+ """ + output: list[ChatMessage] = [] + + parsed_events = EventParser.parse(event) + + for parsed_event in parsed_events: + if parsed_event.type == EventType.FINAL_RESPONSE: + output.append( + ChatMessage( + role="assistant", + content=parsed_event.text, + metadata=MetadataDict(), + ) + ) + elif parsed_event.type == EventType.TOOL_CALL: + output.append( + ChatMessage( + role="assistant", + content=f"```\n{parsed_event.arguments}\n```", + metadata={"title": f"🛠️ Used tool `{parsed_event.text}`"}, + ) + ) + elif parsed_event.type == EventType.THOUGHT: + output.append( + ChatMessage( + role="assistant", + content=parsed_event.text or "", + metadata={"title": "🧠 Thought"}, + ) + ) + elif parsed_event.type == EventType.TOOL_RESPONSE: + output.append( + ChatMessage( + role="assistant", + content=f"```\n{parsed_event.arguments}\n```", + metadata={ + "title": f"*Tool call output: `{parsed_event.text}`*", + "status": "done", # This makes it collapsed by default + }, + ) + ) + + return output