From 054a95b75b690bcf3e84a3d3f3d3b9e28e1ac403 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 18 Feb 2026 12:29:29 -0500
Subject: [PATCH 1/3] Add AML agent evaluation script

---
 .../agent_evals/aml_investigation/agent.py    | 100 ++++++---
 .../aml_investigation/data/cases.py           |   2 +-
 .../aieng/agent_evals/evaluation/types.py     |   3 -
 implementations/aml_investigation/evaluate.py | 204 ++++++++++++++++++
 .../rubrics/narrative_pattern_quality.md      |  29 +++
 5 files changed, 307 insertions(+), 31 deletions(-)
 create mode 100644 implementations/aml_investigation/evaluate.py
 create mode 100644 implementations/aml_investigation/rubrics/narrative_pattern_quality.md

diff --git a/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py b/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py
index 77320c1..c099d10 100644
--- a/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py
+++ b/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py
@@ -24,20 +24,23 @@
 from google.adk.agents.base_agent import AfterAgentCallback, BeforeAgentCallback
 from google.adk.agents.llm_agent import AfterModelCallback, BeforeModelCallback
 from google.adk.tools.function_tool import FunctionTool
-from google.genai.types import GenerateContentConfig, ThinkingConfig
+from google.genai.types import GenerateContentConfig, HttpOptions, ThinkingConfig
 
 
 _DEFAULT_AGENT_DESCRIPTION = "Conducts multi-step investigations for money laundering patterns using database queries."
 
 ANALYST_PROMPT = """\
 You are an Anti‑Money Laundering (AML) Investigation Analyst at a financial institution. \
-Your job is to investigate one case by reviewing activity in the available database and explaining whether the \
-observed behavior within the case window is consistent with money laundering or a benign explanation.
+Your job is to investigate a case by reviewing activity in the available database and explaining whether the \
+observed behavior within the case window is consistent with a money laundering pattern or a benign explanation.
 
-You have access to database query tools. Use them. Do not guess or invent transactions.
+You have access to tools for querying the database. Use them strategically. Do NOT guess or invent transactions.
 
-## Core Principle: Falsification
-Start with the hypothesis that the case is benign. Prefer legitimate explanations unless the transaction-level evidence supports laundering.
+## Core Principles
+- Start with the hypothesis that activity is legitimate/benign unless evidence contradicts this.
+- Laundering requires multiple indicators from different categories, not single factors alone.
+- Entity type, business model, and transaction purpose determine whether patterns are suspicious.
+- Base conclusions on observable transaction patterns, not speculation or absence of information.
 
 ## Input
 You will be given a JSON object with these fields:
@@ -45,48 +48,86 @@
 - `seed_transaction_id`: identifier for the primary transaction that triggered the case.
 - `seed_timestamp`: timestamp of the seed transaction (end of the investigation window).
 - `window_start`: timestamp of the beginning of the investigation window.
-- `trigger_label`: upstream alert/review label or heuristic hint (may be wrong).
+- `trigger_label`: upstream alert or review label that initiated the case. This may be noisy and should not be taken \
+  at face value.
 
-### Time Scope Constraint
-**Critical**: Only analyze events with timestamps between `window_start` and `seed_timestamp` (inclusive). Exclude any events after `seed_timestamp`.
+**Time Scope**: Only analyze events with timestamps between `window_start` and `seed_timestamp` (inclusive).
 
 ## Investigation Workflow
 
-### Step 1: Orient
-Review the `trigger_label` as context only. Do not assume it is correct.
+### Step 1: Seed Transaction Review
+Query the seed transaction and extract:
+- Involved parties and their entity types (Corporation, Sole Proprietorship, Partnership, Individual)
+- Amounts, currencies, payment channels
+- Timestamps and jurisdictions
 
-### Step 2: Seed Transaction Review
-- Query the seed transaction using `seed_transaction_id`
-- Extract: involved parties, amounts, payment channels, instruments, and other relevant attributes
+### Step 2: Scope and Collect
+**Note**: You have a limited context window and a limited number of database queries. Be strategic with the queries \
+you run to avoid hitting limits before gathering enough evidence to make a determination.
 
-### Step 3: Scope and Collect
-Pull related activity for involved entities within the investigation window (`window_start` to `seed_timestamp`, inclusive).
+**For each account you investigate**:
 
-### Step 4: Assess Benign Explanations (Default Hypothesis)
+1. **Always start with aggregates**:
+   ```
+   - COUNT(*) transactions
+   - COUNT(DISTINCT counterparty)
+   - SUM(amount) by direction
+   - Distribution by payment type/time
+   ```
+
+2. **Pull details selectively**:
+   - If count ≤ 20 transactions: Safe to SELECT all
+   - If count > 20: Query top counterparties, then pull samples for suspicious patterns
+   - Never pull thousands of raw transactions - use aggregates + samples
+
+3. **Expand strategically**:
+   - Follow promising leads from aggregates (unusual counterparties, timing clusters)
+   - Maximum 2-3 hops from seed unless clear layering chain
+
+### Step 3: Assess Benign Explanations (Default Hypothesis)
 Attempt to explain observed activity as legitimate first:
 - State which evidence supports the benign hypothesis
 - Identify what additional data would strengthen this explanation
-- Only proceed to Step 5 if benign explanations are insufficient
+- Only proceed to Step 4 if benign explanations are insufficient
 
-### Step 5: Test Laundering Hypotheses (If Needed)
+### Step 4: Test Laundering Hypotheses (If Needed)
 If benign explanations fail to account for the evidence:
 - Test whether the evidence supports known laundering typologies
 - Cite concrete indicators that rule out benign explanations
 
 ## Typologies / Heuristics
-When assessing patterns, consider these typologies:
-- FAN-IN (aggregation): Many sources aggregating to one destination
-- FAN-OUT (dispersion): One source dispersing to many destinations
-- GATHER-SCATTER / SCATTER-GATHER: Aggregation followed by dispersion (or reverse) over short time windows
-- STACK / LAYERING: Multiple hops meant to obscure origin
-- CYCLE: Circular fund movement
-- BIPARTITE: Structured flows between two distinct groups
-- RANDOM: Complex pattern with no discernible structure
+Consider the following typologies when assessing laundering patterns:
+
+- FAN-IN: *Many* distinct source accounts -> *One* destination account (consolidation/aggregation)
+- FAN-OUT: *One* source account -> *Many* distinct destination accounts (distribution/dispersion)
+- GATHER-SCATTER: *Many* sources -> *One* hub -> *Many* destinations (in that temporal order)
+  - First phase: Hub gathers from multiple sources
+  - Second phase: Hub scatters to multiple destinations
+  - Time gap between phases: typically hours to days.
+- SCATTER-GATHER: *One* source -> *Many* intermediaries -> *One* destination (in that temporal order)
+  - First phase: Source scatters to multiple intermediaries
+  - Second phase: Intermediaries gather to final destination
+  - Creates layering through multiple parallel paths.
+- STACK / LAYERING: Sequential hops through multiple accounts (linear chain). The purpose is typically to obscure the \
+  origin through distance/complexity.
+- CYCLE: Funds return to their origin point, creating a circular flow.
+- BIPARTITE: Structured flows between two distinct, segregated groups with no within-group transactions. The segregation \
+  and lack of within-group transactions is the defining characteristic. It's not just two-way flows, it's structured \
+  isolation between groups.
+- RANDOM: Complex pattern with no discernible structure. Use only when activity is clearly suspicious but doesn't fit \
+  other typologies.
+- NONE: No laundering pattern is supported by evidence in the investigation window.
 
 ## Output Format
 Return a single JSON object matching the configured output schema exactly. Populate every field.
 Use `pattern_type = "NONE"` when no laundering pattern is supported by evidence in the investigation window.
 
+**Rules for flagging transaction IDs**:
+- **Causal Chain Only**: Include *only* the transactions that form the identified laundering pattern.
+- **Exclude Noise**: If an account has more transactions but only 3 are part of the laundering chain, output *only* those 3 IDs.
+- When flagging transaction IDs, the seed transaction should be the last transaction in the chain (i.e., the most recent transaction), \
+  since the investigation window ends with the seed transaction.
+
 ## Handling Uncertainty
 If you lack sufficient information to make a determination, explicitly state what data is missing. \
 Do not fabricate transaction details or make unsupported inferences. When uncertain between benign and suspicious, \
@@ -110,6 +151,7 @@ def create_aml_investigation_agent(
     after_agent_callback: AfterAgentCallback | None = None,
     before_model_callback: BeforeModelCallback | None = None,
     after_model_callback: AfterModelCallback | None = None,
+    timeout_sec: int | None = None,
     enable_tracing: bool = True,
 ) -> LlmAgent:
     """Create a configured AML investigation agent.
@@ -155,6 +197,9 @@ def create_aml_investigation_agent(
         Callback executed before each model call.
     after_model_callback : AfterModelCallback | None, optional
         Callback executed after each model call.
+    timeout_sec : int | None, optional
+        Optional timeout in seconds for model calls. If specified, model calls
+        that exceed this duration will be cancelled.
     enable_tracing : bool, optional, default=True
         Whether to initialize Langfuse tracing for this agent. If ``True``, Langfuse
         tracing is initialized with the agent's name as the service name.
@@ -201,6 +246,7 @@ def create_aml_investigation_agent(
         instruction=instructions or ANALYST_PROMPT,
         tools=[FunctionTool(db.get_schema_info), FunctionTool(db.execute)],
         generate_content_config=GenerateContentConfig(
+            http_options=HttpOptions(timeout=timeout_sec * 1000) if timeout_sec is not None else None,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
diff --git a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py
index 0ba833c..3572f8e 100644
--- a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py
+++ b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py
@@ -99,7 +99,7 @@ class AnalystOutput(BaseModel):
     pattern_type: LaunderingPattern = Field(..., description="The type of laundering pattern in the case.")
     pattern_description: str = Field(..., description="A short description of the laundering pattern.")
     flagged_transaction_ids: str = Field(
-        ..., description="A comma-separated list of transaction IDs flagged by the analyst as suspicious."
+        ..., description="A string of comma-separated transaction IDs that are make up the laundering pattern."
     )
 
 
diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/types.py b/aieng-eval-agents/aieng/agent_evals/evaluation/types.py
index c81a714..5dba29a 100644
--- a/aieng-eval-agents/aieng/agent_evals/evaluation/types.py
+++ b/aieng-eval-agents/aieng/agent_evals/evaluation/types.py
@@ -137,15 +137,12 @@ class TraceEvalResult:
         Trace IDs that failed due to errors during evaluation.
     errors_by_trace_id : dict[str, str]
         Error messages associated with skipped or failed traces.
-    run_evaluations : list[Evaluation]
-        Aggregated trace evaluation metrics written at dataset-run level.
     """
 
     evaluations_by_trace_id: dict[str, list[Evaluation]] = field(default_factory=dict)
     skipped_trace_ids: list[str] = field(default_factory=list)
     failed_trace_ids: list[str] = field(default_factory=list)
     errors_by_trace_id: dict[str, str] = field(default_factory=dict)
-    run_evaluations: list[Evaluation] = field(default_factory=list)
 
 
 @dataclass(frozen=True)
diff --git a/implementations/aml_investigation/evaluate.py b/implementations/aml_investigation/evaluate.py
new file mode 100644
index 0000000..dbb102b
--- /dev/null
+++ b/implementations/aml_investigation/evaluate.py
@@ -0,0 +1,204 @@
+"""Evaluate the AML investigation agent.
+
+This script uploads the AML investigation dataset to Langfuse, runs the evaluation
+experiment with item-level and trace-level evaluators, and displays the results
+in the console. The evaluation includes deterministic grading based on known ground
+truth, as well as LLM-based assessments of narrative quality and trace groundedness.
+
+Example
+-------
+$ uv run --env-file .env implementations/aml_investigation/evaluate.py \
+    --dataset-path implementations/aml_investigation/data/aml_cases.jsonl \
+    --dataset-name AML-investigation
+"""
+
+import asyncio
+import logging
+from functools import partial
+
+import click
+from aieng.agent_evals.aml_investigation.agent import create_aml_investigation_agent
+from aieng.agent_evals.aml_investigation.graders import (
+    item_level_deterministic_grader,
+    run_level_grader,
+    trace_deterministic_grader,
+)
+from aieng.agent_evals.aml_investigation.task import AmlInvestigationTask
+from aieng.agent_evals.db_manager import DbManager
+from aieng.agent_evals.display import create_console, display_info, display_metrics_table
+from aieng.agent_evals.evaluation import TraceWaitConfig
+from aieng.agent_evals.evaluation.experiment import run_experiment_with_trace_evals
+from aieng.agent_evals.evaluation.graders import (
+    create_llm_as_judge_evaluator,
+    create_trace_groundedness_evaluator,
+)
+from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
+from aieng.agent_evals.langfuse import upload_dataset_to_langfuse
+from rich.logging import RichHandler
+
+
+logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_path=False)], force=True)
+
+# Silence verbose INFO logs from Google ADK
+logging.getLogger("google_adk").setLevel(logging.WARNING)
+
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.option(
+    "--dataset-path",
+    type=click.Path(exists=True, dir_okay=False, readable=True),
+    required=True,
+    help="Path to the dataset JSONL file.",
+)
+@click.option("--dataset-name", type=str, required=True, help="Name of the dataset to upload to Langfuse.")
+@click.option(
+    "--agent-timeout",
+    type=click.IntRange(min=1, max_open=True),
+    default=300,
+    help="Timeout in seconds for the AML investigation agent.",
+)
+@click.option(
+    "--llm-judge-timeout",
+    type=click.IntRange(min=1, max_open=True),
+    default=120,
+    help="Timeout in seconds for LLM judge evaluations.",
+)
+@click.option(
+    "--llm-judge-retries",
+    type=click.IntRange(min=0, max_open=True),
+    default=3,
+    help="Number of retry attempts for LLM judge evaluations in case of failures.",
+)
+@click.option(
+    "--max-concurrent-cases",
+    type=click.IntRange(min=1, max=10),
+    default=5,
+    help="Maximum number of concurrent cases to process during evaluation.",
+)
+@click.option(
+    "--max-concurrent-traces",
+    type=click.IntRange(min=1, max=10),
+    default=10,
+    help="Maximum number of concurrent traces to process during evaluation.",
+)
+@click.option(
+    "--max-trace-wait-time",
+    type=click.IntRange(min=1, max_open=True),
+    default=300,
+    help="Maximum time in seconds to wait for trace data to be ready during evaluation.",
+)
+def cli(
+    dataset_path: str,
+    dataset_name: str,
+    llm_judge_timeout: int,
+    llm_judge_retries: int,
+    agent_timeout: int,
+    max_concurrent_cases: int,
+    max_concurrent_traces: int,
+    max_trace_wait_time: int,
+) -> None:
+    """Evaluate AML Investigation agent on a given dataset.
+
+    Parameters
+    ----------
+    dataset_path : str
+        Path to the dataset JSONL file containing AML cases.
+    dataset_name : str
+        Name of the dataset to upload to Langfuse for evaluation.
+    llm_judge_timeout : int
+        Timeout in seconds for LLM-based judge evaluations.
+    llm_judge_retries : int
+        Number of retry attempts for LLM judge evaluations in case of failures.
+    agent_timeout : int
+        Timeout in seconds applied to each model call made by the AML investigation agent.
+    max_concurrent_cases : int
+        Maximum number of concurrent cases to process during evaluation.
+    max_concurrent_traces : int
+        Maximum number of concurrent traces to process during evaluation.
+    max_trace_wait_time : int
+        Maximum time in seconds to wait for trace data to be ready during evaluation.
+    """
+    # Create console for rich formatted output
+    console = create_console(force_jupyter=False)
+
+    # Upload dataset to Langfuse
+    asyncio.run(upload_dataset_to_langfuse(dataset_path, dataset_name))
+
+    # Define graders/evaluators
+    # Item-level LLM-as-a-judge evaluator assesses the quality of the agent's
+    # narrative output based on a rubric.
+    narrative_quality_evaluator = create_llm_as_judge_evaluator(
+        name="narrative_quality",
+        rubric_markdown="implementations/aml_investigation/rubrics/narrative_pattern_quality.md",
+        model_config=LLMRequestConfig(timeout_sec=llm_judge_timeout, retry_max_attempts=llm_judge_retries),
+    )
+
+    # Trace-level graders assess the correctness of tool use and the groundedness
+    # of the agent's response based on trace data.
+    db_policy = DbManager().aml_db().policy
+    deterministic_trace_grader = partial(trace_deterministic_grader, db_policy=db_policy)
+    trace_groundedness_evaluator = create_trace_groundedness_evaluator(
+        model_config=LLMRequestConfig(timeout_sec=llm_judge_timeout, retry_max_attempts=llm_judge_retries)
+    )
+
+    agent = create_aml_investigation_agent(timeout_sec=agent_timeout)
+    results = run_experiment_with_trace_evals(
+        dataset_name=dataset_name,
+        name="AML Investigation Evaluation",
+        task=AmlInvestigationTask(agent=agent),
+        evaluators=[item_level_deterministic_grader, narrative_quality_evaluator],
+        trace_evaluators=[deterministic_trace_grader, trace_groundedness_evaluator],
+        run_evaluators=[run_level_grader],
+        max_concurrency=max_concurrent_cases,
+        trace_max_concurrency=max_concurrent_traces,
+        trace_wait=TraceWaitConfig(max_wait_sec=max_trace_wait_time),
+    )
+
+    # Display item-level results
+    console.print("\n[bold cyan]📋 Item-Level Results[/bold cyan]\n")
+    for idx, item_result in enumerate(results.experiment.item_results, start=1):
+        item_metrics = {eval_.name: eval_.value for eval_ in item_result.evaluations}
+        # Try to get item ID from metadata, fall back to index
+        item_id = f"Item {idx}"
+        try:
+            item = item_result.item
+            if item and isinstance(item, dict):
+                metadata = item.get("metadata", {})
+                if metadata and isinstance(metadata, dict):
+                    item_id = metadata.get("id", item_id)
+            elif item and hasattr(item, "metadata"):
+                metadata = getattr(item, "metadata", None)
+                if metadata and isinstance(metadata, dict):
+                    item_id = metadata.get("id", item_id)
+        except Exception:
+            pass  # Keep default item_id
+
+        display_metrics_table(
+            metrics=item_metrics,
+            title=str(item_id),
+            console=console,
+        )
+
+    # Display run-level metrics
+    if hasattr(results.experiment, "run_evaluations") and results.experiment.run_evaluations:
+        console.print("\n[bold green]📊 Run-Level Metrics[/bold green]\n")
+        run_metrics = {eval_.name: eval_.value for eval_ in results.experiment.run_evaluations}
+        display_metrics_table(metrics=run_metrics, title="Aggregate Performance", console=console)
+
+    # Display trace evaluation summary
+    if results.trace_evaluations:
+        console.print("\n[bold magenta]🔍 Trace Evaluation Summary[/bold magenta]\n")
+        trace_summary: dict[str, float | int | str] = {
+            "Successful Traces": len(results.trace_evaluations.evaluations_by_trace_id),
+            "Skipped Traces": len(results.trace_evaluations.skipped_trace_ids),
+            "Failed Traces": len(results.trace_evaluations.failed_trace_ids),
+        }
+        display_metrics_table(metrics=trace_summary, title="Trace Processing", console=console)
+
+    display_info("Evaluation complete! Results have been uploaded to Langfuse.", console=console)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/implementations/aml_investigation/rubrics/narrative_pattern_quality.md b/implementations/aml_investigation/rubrics/narrative_pattern_quality.md
new file mode 100644
index 0000000..1074e3f
--- /dev/null
+++ b/implementations/aml_investigation/rubrics/narrative_pattern_quality.md
@@ -0,0 +1,29 @@
+This rubric scores investigation reasoning quality only. It does not score grammar, writing style, tone, or fluency.
+
+### Scoring Table
+
+| Score | `summary_narrative_quality` | `pattern_description_quality` |
+| --- | --- | --- |
+| 5 | Evidence-grounded and coherent investigation logic. Explicitly considers and rules out plausible benign explanations. Conclusion is fully consistent with cited evidence and final decision fields. | Clear mechanism-level typology description: flow shape, actor roles, fund movement pattern, and temporal logic are explicit and consistent with the conclusion. |
+| 4 | Strong evidence linkage and mostly coherent logic, with only minor omissions or weak spots. | Mechanism is mostly correct and specific, with minor incompleteness. |
+| 3 | Mixed quality. Some grounding and reasoning are present, but analysis is partially generic, incomplete, or weakly connected to evidence. | Partially correct mechanism but vague, generic, or only partially connected to evidence. |
+| 2 | Weak evidence grounding with major reasoning gaps, leaps, or unsupported inferences. | Mostly vague description and/or materially incomplete with partial inaccuracies. |
+| 1 | Reasoning is unsupported, contradictory, or materially inconsistent with available evidence. | Incorrect, contradictory, or effectively empty mechanism description. |
+
+### Hard Guardrails (Reasoning Quality With Hard Caps)
+
+- If the narrative contains material unsupported claims: `summary_narrative_quality <= 2`.
+- If the narrative contradicts final decision fields (`is_laundering`, `pattern_type`): `summary_narrative_quality = 1`.
+- If the pattern description contradicts decision fields or typology semantics: `pattern_description_quality = 1`.
+- If the pattern description is effectively placeholder or non-informative: `pattern_description_quality <= 2`.
+
+### Scoring Instructions
+
+- Use integers only: `1`, `2`, `3`, `4`, `5`.
+- Judge only from the provided input, expected output, and candidate output.
+- Keep comments concise and evidence-focused.
+
+### Special Cases
+
+- If ground-truth `pattern_description` is missing, `N/A`, or equivalent placeholder text, treat any coherent candidate pattern description as valid when it is consistent with other fields (especially `is_laundering` and `pattern_type`).
+- Ground-truth `pattern_description` may be terse typology shorthand (for example, `Max 1-degree Fan-In, Max 10-degree Fan-Out, Max 7 hops`). In these cases, evaluate semantic consistency with typology mechanics rather than exact phrasing.

From be2858b675557ee0f1494e2cdec8292d354b0ff1 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 18 Feb 2026 12:58:35 -0500
Subject: [PATCH 2/3] Update README to include documentation on evaluation
 script

---
 implementations/aml_investigation/README.md | 50 +++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/implementations/aml_investigation/README.md b/implementations/aml_investigation/README.md
index 3f2c9df..31f852f 100644
--- a/implementations/aml_investigation/README.md
+++ b/implementations/aml_investigation/README.md
@@ -81,6 +81,56 @@ uv run --env-file .env implementations/aml_investigation/cli.py
 
 The script prints a simple confusion matrix for `is_laundering` based on the cases that have `output`.
 
+## Evaluate the Agent
+
+The evaluation script uploads the AML investigation dataset to Langfuse, runs a comprehensive evaluation experiment with multiple types of evaluators, and displays results in the console.
+
+```bash
+uv run --env-file .env implementations/aml_investigation/evaluate.py \
+  --dataset-path implementations/aml_investigation/data/aml_cases.jsonl \
+  --dataset-name AML-investigation
+```
+
+### Evaluation Levels
+
+The evaluation framework assesses agent performance at three levels:
+
+**Item-Level Evaluators** — Score each individual case prediction:
+
+- **Deterministic grader**: Checks correctness of `is_laundering`, `pattern_type`, and flagged transaction IDs against ground truth
+- **Narrative quality evaluator**: LLM-as-judge that scores the investigation reasoning and pattern description quality using the rubric in `rubrics/narrative_pattern_quality.md`
+
+**Trace-Level Evaluators** — Analyze tool-use trajectories for each agent run:
+
+- **Trace deterministic grader**: Validates SQL safety (read-only compliance), time window adherence, and query redundancy metrics
+- **Trace groundedness evaluator**: LLM-based assessment of whether the agent's narrative is grounded in the actual tool outputs
+
+**Run-Level Grader** — Aggregates results across all cases:
+
+- Computes precision, recall, and F1-score for `is_laundering` detection
+- Generates macro F1-score and confusion matrix for `pattern_type` classification
+
+### CLI Options
+
+Key options you may want to adjust:
+
+- `--agent-timeout`: Timeout in seconds for each model call made by the agent (default: 300)
+- `--llm-judge-timeout`: Timeout for LLM judge evaluations (default: 120)
+- `--llm-judge-retries`: Retry attempts for LLM judge failures (default: 3)
+- `--max-concurrent-cases`: Maximum concurrent cases to process (default: 5)
+- `--max-concurrent-traces`: Maximum concurrent trace evaluations (default: 10)
+- `--max-trace-wait-time`: Maximum seconds to wait for trace data (default: 300)
+
+### Output
+
+The evaluation displays:
+
+1. **Per-item metrics tables**: Shows deterministic and narrative quality scores for each case
+2. **Run-level aggregate metrics**: Overall precision, recall, F1-score, and confusion matrix
+3. **Trace evaluation summary**: Count of successful, skipped, and failed trace evaluations
+
+All results are uploaded to Langfuse for further analysis and visualization.
+
 ## Run with ADK Web UI
 
 If you want to inspect the agent interactively, the module exposes a top-level `root_agent` for ADK discovery.

From 48fb98a6f3eadb610946ca064d298fd3c06fa784 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 18 Feb 2026 12:58:45 -0500
Subject: [PATCH 3/3] Fix typo in AnalystOutput model description for
 flagged_transaction_ids field

---
 .../aieng/agent_evals/aml_investigation/data/cases.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py
index 3572f8e..a25810c 100644
--- a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py
+++ b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py
@@ -99,7 +99,7 @@ class AnalystOutput(BaseModel):
     pattern_type: LaunderingPattern = Field(..., description="The type of laundering pattern in the case.")
     pattern_description: str = Field(..., description="A short description of the laundering pattern.")
     flagged_transaction_ids: str = Field(
-        ..., description="A string of comma-separated transaction IDs that are make up the laundering pattern."
+        ..., description="A string of comma-separated transaction IDs that make up the laundering pattern."
     )