Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,17 @@ class Configs(BaseSettings):
web_search_base_url: str | None = Field(default=None, description="Base URL for web search service.")
web_search_api_key: SecretStr | None = Field(default=None, description="API key for web search service.")

# === Report Generation ===
# Defaults are set in the implementations/report_generation/env_vars.py file
report_generation_output_path: str | None = Field(
default=None,
description="Path to the directory where the report generation agent will save the reports.",
)
report_generation_langfuse_project_name: str | None = Field(
default=None,
description="Name of the Langfuse project to use for report generation.",
)

# Validators for the SecretStr fields
@field_validator("langfuse_secret_key")
@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
def get_report_generation_agent(
instructions: str,
reports_output_path: Path,
langfuse_project_name: str | None,
langfuse_project_name: str | None = None,
after_agent_callback: AfterAgentCallback | None = None,
) -> Agent:
"""
Expand All @@ -49,11 +49,12 @@ def get_report_generation_agent(
The instructions for the agent.
reports_output_path : Path
The path to the reports output directory.
langfuse_project_name : str | None
langfuse_project_name : str | None, optional
The name of the Langfuse project to use for tracing.
after_agent_callback : AfterAgentCallback | None
Default is None, which means no tracing will be used.
after_agent_callback : AfterAgentCallback | None, optional
The callback function to be called after the agent has
finished executing.
finished executing. Default is None.

Returns
-------
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Tests for the offline evaluation of the report generation agent."""

from pathlib import Path
from unittest.mock import ANY, Mock, patch

import pytest
from aieng.agent_evals.report_generation.evaluation.offline import (
evaluate,
final_result_evaluator,
trajectory_evaluator,
)


@patch("aieng.agent_evals.report_generation.evaluation.offline.AsyncClientManager.get_instance")
@patch("aieng.agent_evals.report_generation.evaluation.offline.DbManager.get_instance")
@pytest.mark.asyncio
async def test_evaluate(mock_db_manager_instance, mock_async_client_manager_instance):
"""Test the evaluate function."""
test_dataset_name = "test_dataset"
test_reports_output_path = Path("reports/")
test_langfuse_project_name = "test_project"
test_max_concurrency = 5

mock_result = Mock()
mock_dataset = Mock()
mock_dataset.run_experiment.return_value = mock_result
mock_langfuse_client = Mock()
mock_langfuse_client.get_dataset.return_value = mock_dataset
mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_db_manager_instance.return_value = Mock()

await evaluate(
dataset_name=test_dataset_name,
reports_output_path=test_reports_output_path,
langfuse_project_name=test_langfuse_project_name,
max_concurrency=test_max_concurrency,
)

mock_dataset.run_experiment.assert_called_once_with(
name="Evaluate Report Generation Agent",
description="Evaluate the Report Generation Agent with data from Langfuse",
task=ANY,
evaluators=[final_result_evaluator, trajectory_evaluator],
max_concurrency=test_max_concurrency,
)

task = mock_dataset.run_experiment.call_args_list[0][1]["task"]
assert task.__name__ == "run"
assert task.__self__.__class__.__name__ == "ReportGenerationTask"
assert task.__self__.reports_output_path == test_reports_output_path
assert task.__self__.langfuse_project_name == test_langfuse_project_name

mock_db_manager_instance.return_value.close.assert_called_once()
mock_async_client_manager_instance.return_value.close.assert_called_once()
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Tests for the online evaluation of the report generation agent."""

from unittest.mock import Mock, patch

import pytest
from aieng.agent_evals.report_generation.evaluation.online import report_final_response_score


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_score_positive_score(mock_async_client_manager_instance):
"""Test the report_final_response_score function with a positive score."""
test_string_match = "string-to-match"
test_trace_id = "test_trace_id"

mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = test_trace_id

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_event = Mock()
mock_event.is_final_response.return_value = True
mock_event.content = Mock()
mock_event.content.parts = [
Mock(text=f"test_final_response_text {test_string_match} test_final_response_text"),
]

report_final_response_score(mock_event, string_match=test_string_match)

mock_langfuse_client.create_score.assert_called_once_with(
name="Valid Final Response",
value=1,
trace_id=test_trace_id,
comment="Final response contains the string match.",
metadata={
"final_response": mock_event.content.parts[0].text,
"string_match": test_string_match,
},
)
mock_langfuse_client.flush.assert_called_once()


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_score_negative_score(mock_async_client_manager_instance):
"""Test the report_final_response_score function with a negative score."""
test_string_match = "string-to-match"
test_trace_id = "test_trace_id"

mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = test_trace_id

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_event = Mock()
mock_event.is_final_response.return_value = True
mock_event.content = Mock()
mock_event.content.parts = [
Mock(text="test_final_response_text test_final_response_text"),
]

report_final_response_score(mock_event, string_match=test_string_match)

mock_langfuse_client.create_score.assert_called_once_with(
name="Valid Final Response",
value=0,
trace_id=test_trace_id,
comment="Final response does not contains the string match.",
metadata={
"final_response": mock_event.content.parts[0].text,
"string_match": test_string_match,
},
)
mock_langfuse_client.flush.assert_called_once()


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_invalid(mock_async_client_manager_instance):
"""Test the report_final_response_score function with a negative score."""
test_string_match = "string-to-match"
test_trace_id = "test_trace_id"

mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = test_trace_id

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

mock_event = Mock()
mock_event.is_final_response.return_value = True
mock_event.content = Mock()
mock_event.content.parts = [Mock(text=None)]

report_final_response_score(mock_event, string_match=test_string_match)

mock_langfuse_client.create_score.assert_called_once_with(
name="Valid Final Response",
value=0,
trace_id=test_trace_id,
comment="Final response not found in the event",
metadata={
"string_match": test_string_match,
},
)
mock_langfuse_client.flush.assert_called_once()


def test_report_final_response_not_final_response():
    """report_final_response_score rejects events that are not final responses."""
    non_final_event = Mock()
    non_final_event.is_final_response.return_value = False

    with pytest.raises(ValueError, match="Event is not a final response"):
        report_final_response_score(non_final_event)


@patch("aieng.agent_evals.report_generation.evaluation.online.AsyncClientManager.get_instance")
def test_report_final_response_langfuse_trace_id_none(mock_async_client_manager_instance):
"""Test raising an error when the Langfuse trace ID is None."""
mock_langfuse_client = Mock()
mock_langfuse_client.get_current_trace_id.return_value = None

mock_async_client_manager_instance.return_value = Mock()
mock_async_client_manager_instance.return_value.langfuse_client = mock_langfuse_client

with pytest.raises(ValueError, match="Langfuse trace ID is None."):
report_final_response_score(Mock())
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""Tests for the report generation agent."""

import logging
import shutil
from pathlib import Path
from unittest.mock import Mock, patch

import pytest
from aieng.agent_evals.configs import Configs
from aieng.agent_evals.report_generation.agent import EventParser, EventType, get_report_generation_agent


# Configure root logging at import time so test output carries timestamps,
# levels, and logger names; module-level logger for this test file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logger = logging.getLogger(__name__)


@pytest.fixture
def setup_dotenv():
    """Copy .env.example to .env for the test run, then remove it in teardown.

    Any pre-existing .env in the cwd is moved aside to .env.bkp for the
    duration of the test and restored afterwards.

    Raises
    ------
    RuntimeError
        If no ancestor directory named "eval-agents" exists above the cwd.
    """
    # Walk up from the cwd to the repository root ("eval-agents"), which holds
    # the .env.example template.
    env_dir = Path.cwd()
    while env_dir.name != "eval-agents":
        if env_dir == env_dir.parent:
            # Reached the filesystem root (Path.parent of the root is itself);
            # without this guard the loop would spin forever.
            raise RuntimeError("Could not find an 'eval-agents' directory above the cwd.")
        env_dir = env_dir.parent

    env_exists = Path(".env").exists()
    if env_exists:
        # Preserve the developer's real .env while the test runs.
        shutil.move(".env", ".env.bkp")

    shutil.copy(env_dir / ".env.example", ".env")

    yield

    # Teardown: drop the temporary .env and restore the original, if any.
    Path(".env").unlink()
    if env_exists:
        shutil.move(".env.bkp", ".env")


@patch("aieng.agent_evals.report_generation.agent.init_tracing")
def test_get_report_generation_agent_with_langfuse(mock_init_tracing, setup_dotenv):
"""Test the get_report_generation_agent function."""
test_instructions = "You are a report generation agent."
test_langfuse_project_name = "test_langfuse_project_name"
test_reports_output_path = Path("reports/")
test_after_agent_callback = Mock()

agent = get_report_generation_agent(
instructions=test_instructions,
reports_output_path=test_reports_output_path,
langfuse_project_name=test_langfuse_project_name,
after_agent_callback=test_after_agent_callback,
)

assert agent.name == "ReportGenerationAgent"
assert agent.model == Configs().default_worker_model
assert agent.instruction == test_instructions
assert [tool.__name__ for tool in agent.tools] == ["get_schema_info", "execute", "write_xlsx"]
assert agent.tools[2].__self__.reports_output_path == test_reports_output_path
assert agent.after_agent_callback == test_after_agent_callback

mock_init_tracing.assert_called_once_with(test_langfuse_project_name)


@patch("aieng.agent_evals.report_generation.agent.init_tracing")
def test_get_report_generation_agent_without_langfuse(mock_init_tracing, setup_dotenv):
"""Test the get_report_generation_agent function."""
test_instructions = "You are a report generation agent."
test_reports_output_path = Path("reports/")

agent = get_report_generation_agent(
instructions=test_instructions,
reports_output_path=test_reports_output_path,
)

assert agent is not None
mock_init_tracing.assert_not_called()


def test_parse_event_final_response():
    """A final-response event parses into a single FINAL_RESPONSE entry."""
    response_text = "Hello, world!"

    event_mock = Mock()
    event_mock.is_final_response.return_value = True
    event_mock.content = Mock()
    event_mock.content.parts = [Mock(text=response_text)]

    parsed = EventParser.parse(event_mock)

    assert len(parsed) == 1
    assert parsed[0].type == EventType.FINAL_RESPONSE
    assert parsed[0].text == response_text


def test_parse_event_invalid_final_response():
    """Final-response events with no usable text yield no parsed events."""
    event_mock = Mock()
    event_mock.is_final_response.return_value = True

    # Each setup leaves the event without extractable final-response text.
    def without_content(event):
        event.content = None

    def without_parts(event):
        event.content = Mock()
        event.content.parts = None

    def with_empty_parts(event):
        event.content = Mock()
        event.content.parts = []

    def with_textless_part(event):
        event.content = Mock()
        event.content.parts = [Mock(text=None)]

    for configure in (without_content, without_parts, with_empty_parts, with_textless_part):
        configure(event_mock)
        assert len(EventParser.parse(event_mock)) == 0


def test_parse_event_model_response():
    """A model event yields a TOOL_CALL for function calls and a THOUGHT for thought parts."""
    # ``name`` must be set after construction: Mock(name=...) names the mock itself.
    call_mock = Mock()
    call_mock.name = "test_function_call_name"
    call_mock.args = "test_args"
    thought_part = Mock(function_call=None, thought_signature="test_thought_signature", text="test thought text")

    event_mock = Mock()
    event_mock.is_final_response.return_value = False
    event_mock.content = Mock()
    event_mock.content.role = "model"
    event_mock.content.parts = [Mock(function_call=call_mock), thought_part]

    parsed = EventParser.parse(event_mock)

    assert parsed[0].type == EventType.TOOL_CALL
    assert parsed[0].text == call_mock.name
    assert parsed[0].arguments == call_mock.args
    assert parsed[1].type == EventType.THOUGHT
    assert parsed[1].text == thought_part.text


def test_parse_event_user_response():
    """A user event with a function response yields a single TOOL_RESPONSE entry."""
    # ``name`` must be set after construction: Mock(name=...) names the mock itself.
    response_mock = Mock()
    response_mock.name = "test_function_response_name"
    response_mock.response = "test_response"

    event_mock = Mock()
    event_mock.is_final_response.return_value = False
    event_mock.content = Mock()
    event_mock.content.role = "user"
    event_mock.content.parts = [Mock(function_response=response_mock)]

    parsed = EventParser.parse(event_mock)

    assert len(parsed) == 1
    assert parsed[0].type == EventType.TOOL_RESPONSE
    assert parsed[0].text == response_mock.name
    assert parsed[0].arguments == response_mock.response
Loading