Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,17 @@ class Configs(BaseSettings):
web_search_base_url: str | None = Field(default=None, description="Base URL for web search service.")
web_search_api_key: SecretStr | None = Field(default=None, description="API key for web search service.")

# === Report Generation ===
# Defaults are set in the implementations/report_generation/env_vars.py file
report_generation_output_path: str | None = Field(
default=None,
description="Path to the directory where the report generation agent will save the reports.",
)
report_generation_langfuse_project_name: str | None = Field(
default=None,
description="Name of the Langfuse project to use for report generation.",
)

# Validators for the SecretStr fields
@field_validator("langfuse_secret_key")
@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
def get_report_generation_agent(
instructions: str,
reports_output_path: Path,
langfuse_project_name: str | None,
langfuse_project_name: str | None = None,
after_agent_callback: AfterAgentCallback | None = None,
) -> Agent:
"""
Expand All @@ -49,11 +49,12 @@ def get_report_generation_agent(
The instructions for the agent.
reports_output_path : Path
The path to the reports output directory.
langfuse_project_name : str | None
langfuse_project_name : str | None, optional
The name of the Langfuse project to use for tracing.
after_agent_callback : AfterAgentCallback | None
Default is None, which means no tracing will be used.
after_agent_callback : AfterAgentCallback | None, optional
The callback function to be called after the agent has
finished executing.
finished executing. Default is None.

Returns
-------
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Tests for the offline evaluation of the report generation agent."""

from pathlib import Path
from unittest.mock import ANY, Mock, patch

import pytest
from aieng.agent_evals.report_generation.evaluation.offline import (
evaluate,
final_result_evaluator,
trajectory_evaluator,
)


@patch("aieng.agent_evals.report_generation.evaluation.offline.AsyncClientManager.get_instance")
@patch("aieng.agent_evals.report_generation.evaluation.offline.DbManager.get_instance")
@pytest.mark.asyncio
async def test_evaluate(mock_db_manager_instance, mock_async_client_manager_instance):
"""Test the evaluate function."""
test_dataset_name = "test_dataset"
test_reports_output_path = Path("reports/")
test_langfuse_project_name = "test_project"
test_max_concurrency = 5

mock_result = Mock()
mock_dataset = Mock()
mock_dataset.run_experiment.return_value = mock_result
mock_langfuse_client = Mock()
mock_langfuse_client.get_dataset.return_value = mock_dataset
mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_db_manager_instance.return_value = Mock()

await evaluate(
dataset_name=test_dataset_name,
reports_output_path=test_reports_output_path,
langfuse_project_name=test_langfuse_project_name,
max_concurrency=test_max_concurrency,
)

mock_dataset.run_experiment.assert_called_once_with(
name="Evaluate Report Generation Agent",
description="Evaluate the Report Generation Agent with data from Langfuse",
task=ANY,
evaluators=[final_result_evaluator, trajectory_evaluator],
max_concurrency=test_max_concurrency,
)

task = mock_dataset.run_experiment.call_args_list[0][1]["task"]
assert task.__name__ == "run"
assert task.__self__.__class__.__name__ == "ReportGenerationTask"
assert task.__self__.reports_output_path == test_reports_output_path
assert task.__self__.langfuse_project_name == test_langfuse_project_name

mock_db_manager_instance.return_value.close.assert_called_once()
mock_async_client_manager_instance.return_value.close.assert_called_once()
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Tests for the online evaluation of the report generation agent."""

from unittest.mock import Mock, patch

import pytest
from aieng.agent_evals.report_generation.evaluation.online import report_final_response_score


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_score_positive_score(mock_async_client_manager_instance):
"""Test the report_final_response_score function with a positive score."""
test_string_match = "string-to-match"
test_trace_id = "test_trace_id"

mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = test_trace_id

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_event = Mock()
mock_event.is_final_response.return_value = True
mock_event.content = Mock()
mock_event.content.parts = [
Mock(text=f"test_final_response_text {test_string_match} test_final_response_text"),
]

report_final_response_score(mock_event, string_match=test_string_match)

mock_langfuse_client.create_score.assert_called_once_with(
name="Valid Final Response",
value=1,
trace_id=test_trace_id,
comment="Final response contains the string match.",
metadata={
"final_response": mock_event.content.parts[0].text,
"string_match": test_string_match,
},
)
mock_langfuse_client.flush.assert_called_once()


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_score_negative_score(mock_async_client_manager_instance):
"""Test the report_final_response_score function with a negative score."""
test_string_match = "string-to-match"
test_trace_id = "test_trace_id"

mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = test_trace_id

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_event = Mock()
mock_event.is_final_response.return_value = True
mock_event.content = Mock()
mock_event.content.parts = [
Mock(text="test_final_response_text test_final_response_text"),
]

report_final_response_score(mock_event, string_match=test_string_match)

mock_langfuse_client.create_score.assert_called_once_with(
name="Valid Final Response",
value=0,
trace_id=test_trace_id,
comment="Final response does not contains the string match.",
metadata={
"final_response": mock_event.content.parts[0].text,
"string_match": test_string_match,
},
)
mock_langfuse_client.flush.assert_called_once()


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_invalid(mock_async_client_manager_instance):
"""Test the report_final_response_score function with a negative score."""
test_string_match = "string-to-match"
test_trace_id = "test_trace_id"

mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = test_trace_id

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_event = Mock()
mock_event.is_final_response.return_value = True
mock_event.content = Mock()
mock_event.content.parts = [Mock(text=None)]

report_final_response_score(mock_event, string_match=test_string_match)

mock_langfuse_client.create_score.assert_called_once_with(
name="Valid Final Response",
value=0,
trace_id=test_trace_id,
comment="Final response not found in the event",
metadata={
"string_match": test_string_match,
},
)
mock_langfuse_client.flush.assert_called_once()


def test_report_final_response_not_final_response():
    """report_final_response_score rejects events that are not final responses."""
    non_final_event = Mock()
    non_final_event.is_final_response.return_value = False

    with pytest.raises(ValueError, match="Event is not a final response"):
        report_final_response_score(non_final_event)


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_langfuse_trace_id_none(mock_async_client_manager_instance):
"""Test raising an error when the Langfuse trace ID is None."""
mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = None

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

with pytest.raises(ValueError, match="Langfuse trace ID is None."):
report_final_response_score(Mock())
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""Tests for the report generation agent."""

import logging
import shutil
from pathlib import Path
from unittest.mock import Mock, patch

import pytest
from aieng.agent_evals.configs import Configs
from aieng.agent_evals.report_generation.agent import EventParser, EventType, get_report_generation_agent


# Configure root logging at import time so test output carries timestamps,
# levels, and logger names; module-level logger for this test file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logger = logging.getLogger(__name__)


@pytest.fixture
def setup_dotenv():
    """Copy .env.example to .env for the test run, then remove it in teardown.

    Any pre-existing .env in the cwd is moved aside to .env.bkp for the
    duration of the test and restored afterwards.

    Raises
    ------
    RuntimeError
        If no ancestor directory named "eval-agents" exists above the cwd.
    """
    # Walk up from the cwd to the repository root ("eval-agents"), which holds
    # the .env.example template.
    env_dir = Path.cwd()
    while env_dir.name != "eval-agents":
        if env_dir == env_dir.parent:
            # Reached the filesystem root (Path.parent of the root is itself);
            # without this guard the loop would spin forever.
            raise RuntimeError("Could not find an 'eval-agents' directory above the cwd.")
        env_dir = env_dir.parent

    env_exists = Path(".env").exists()
    if env_exists:
        # Preserve the developer's real .env while the test runs.
        shutil.move(".env", ".env.bkp")

    shutil.copy(env_dir / ".env.example", ".env")

    yield

    # Teardown: drop the temporary .env and restore the original, if any.
    Path(".env").unlink()
    if env_exists:
        shutil.move(".env.bkp", ".env")


@patch("aieng.agent_evals.report_generation.agent.init_tracing")
def test_get_report_generation_agent_with_langfuse(mock_init_tracing, setup_dotenv):
"""Test the get_report_generation_agent function."""
test_instructions = "You are a report generation agent."
test_langfuse_project_name = "test_langfuse_project_name"
test_reports_output_path = Path("reports/")
test_after_agent_callback = Mock()

agent = get_report_generation_agent(
instructions=test_instructions,
reports_output_path=test_reports_output_path,
langfuse_project_name=test_langfuse_project_name,
after_agent_callback=test_after_agent_callback,
)

assert agent.name == "ReportGenerationAgent"
assert agent.model == Configs().default_worker_model
assert agent.instruction == test_instructions
assert [tool.__name__ for tool in agent.tools] == ["get_schema_info", "execute", "write_xlsx"]
assert agent.tools[2].__self__.reports_output_path == test_reports_output_path
assert agent.after_agent_callback == test_after_agent_callback

mock_init_tracing.assert_called_once_with(test_langfuse_project_name)


@patch("aieng.agent_evals.report_generation.agent.init_tracing")
def test_get_report_generation_agent_without_langfuse(mock_init_tracing, setup_dotenv):
"""Test the get_report_generation_agent function."""
test_instructions = "You are a report generation agent."
test_reports_output_path = Path("reports/")

agent = get_report_generation_agent(
instructions=test_instructions,
reports_output_path=test_reports_output_path,
)

assert agent is not None
mock_init_tracing.assert_not_called()


def test_parse_event_final_response():
    """A final-response event parses into a single FINAL_RESPONSE entry."""
    response_text = "Hello, world!"

    event_mock = Mock()
    event_mock.is_final_response.return_value = True
    event_mock.content = Mock()
    event_mock.content.parts = [Mock(text=response_text)]

    parsed = EventParser.parse(event_mock)

    assert len(parsed) == 1
    assert parsed[0].type == EventType.FINAL_RESPONSE
    assert parsed[0].text == response_text


def test_parse_event_invalid_final_response():
    """Final-response events with no usable text yield no parsed events."""
    event_mock = Mock()
    event_mock.is_final_response.return_value = True

    # Each setup leaves the event without extractable final-response text.
    def without_content(event):
        event.content = None

    def without_parts(event):
        event.content = Mock()
        event.content.parts = None

    def with_empty_parts(event):
        event.content = Mock()
        event.content.parts = []

    def with_textless_part(event):
        event.content = Mock()
        event.content.parts = [Mock(text=None)]

    for configure in (without_content, without_parts, with_empty_parts, with_textless_part):
        configure(event_mock)
        assert len(EventParser.parse(event_mock)) == 0


def test_parse_event_model_response():
    """A model event yields a TOOL_CALL for function calls and a THOUGHT for thought parts."""
    # ``name`` must be set after construction: Mock(name=...) names the mock itself.
    call_mock = Mock()
    call_mock.name = "test_function_call_name"
    call_mock.args = "test_args"
    thought_part = Mock(function_call=None, thought_signature="test_thought_signature", text="test thought text")

    event_mock = Mock()
    event_mock.is_final_response.return_value = False
    event_mock.content = Mock()
    event_mock.content.role = "model"
    event_mock.content.parts = [Mock(function_call=call_mock), thought_part]

    parsed = EventParser.parse(event_mock)

    assert parsed[0].type == EventType.TOOL_CALL
    assert parsed[0].text == call_mock.name
    assert parsed[0].arguments == call_mock.args
    assert parsed[1].type == EventType.THOUGHT
    assert parsed[1].text == thought_part.text


def test_parse_event_user_response():
    """A user event with a function response yields a single TOOL_RESPONSE entry."""
    # ``name`` must be set after construction: Mock(name=...) names the mock itself.
    response_mock = Mock()
    response_mock.name = "test_function_response_name"
    response_mock.response = "test_response"

    event_mock = Mock()
    event_mock.is_final_response.return_value = False
    event_mock.content = Mock()
    event_mock.content.role = "user"
    event_mock.content.parts = [Mock(function_response=response_mock)]

    parsed = EventParser.parse(event_mock)

    assert len(parsed) == 1
    assert parsed[0].type == EventType.TOOL_RESPONSE
    assert parsed[0].text == response_mock.name
    assert parsed[0].arguments == response_mock.response
Loading