From 054a95b75b690bcf3e84a3d3f3d3b9e28e1ac403 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:29:29 -0500 Subject: [PATCH 1/3] Add AML agent evaluation script --- .../agent_evals/aml_investigation/agent.py | 100 ++++++--- .../aml_investigation/data/cases.py | 2 +- .../aieng/agent_evals/evaluation/types.py | 3 - implementations/aml_investigation/evaluate.py | 204 ++++++++++++++++++ .../rubrics/narrative_pattern_quality.md | 29 +++ 5 files changed, 307 insertions(+), 31 deletions(-) create mode 100644 implementations/aml_investigation/evaluate.py create mode 100644 implementations/aml_investigation/rubrics/narrative_pattern_quality.md diff --git a/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py b/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py index 77320c1..c099d10 100644 --- a/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/aml_investigation/agent.py @@ -24,20 +24,23 @@ from google.adk.agents.base_agent import AfterAgentCallback, BeforeAgentCallback from google.adk.agents.llm_agent import AfterModelCallback, BeforeModelCallback from google.adk.tools.function_tool import FunctionTool -from google.genai.types import GenerateContentConfig, ThinkingConfig +from google.genai.types import GenerateContentConfig, HttpOptions, ThinkingConfig _DEFAULT_AGENT_DESCRIPTION = "Conducts multi-step investigations for money laundering patterns using database queries." ANALYST_PROMPT = """\ You are an Anti‑Money Laundering (AML) Investigation Analyst at a financial institution. \ -Your job is to investigate one case by reviewing activity in the available database and explaining whether the \ -observed behavior within the case window is consistent with money laundering or a benign explanation. 
+Your job is to investigate a case by reviewing activity in the available database and explaining whether the \ +observed behavior within the case window is consistent with a money laundering pattern or a benign explanation. -You have access to database query tools. Use them. Do not guess or invent transactions. +You have access to tools for querying the database. Use them strategically. Do NOT guess or invent transactions. -## Core Principle: Falsification -Start with the hypothesis that the case is benign. Prefer legitimate explanations unless the transaction-level evidence supports laundering. +## Core Principles +- Start with the hypothesis that activity is legitimate/benign unless evidence contradicts this. +- Laundering requires multiple indicators from different categories, not single factors alone. +- Entity type, business model, and transaction purpose determine whether patterns are suspicious. +- Base conclusions on observable transaction patterns, not speculation or absence of information. ## Input You will be given a JSON object with these fields: @@ -45,48 +48,86 @@ - `seed_transaction_id`: identifier for the primary transaction that triggered the case. - `seed_timestamp`: timestamp of the seed transaction (end of the investigation window). - `window_start`: timestamp of the beginning of the investigation window. -- `trigger_label`: upstream alert/review label or heuristic hint (may be wrong). +- `trigger_label`: upstream alert or review label that initiated the case. This may be noisy and should not be taken \ + at face value. -### Time Scope Constraint -**Critical**: Only analyze events with timestamps between `window_start` and `seed_timestamp` (inclusive). Exclude any events after `seed_timestamp`. +**Time Scope**: Only analyze events with timestamps between `window_start` and `seed_timestamp` (inclusive). ## Investigation Workflow -### Step 1: Orient -Review the `trigger_label` as context only. Do not assume it is correct. 
+### Step 1: Seed Transaction Review +Query the seed transaction and extract: +- Involved parties and their entity types (Corporation, Sole Proprietorship, Partnership, Individual) +- Amounts, currencies, payment channels +- Timestamps and jurisdictions -### Step 2: Seed Transaction Review -- Query the seed transaction using `seed_transaction_id` -- Extract: involved parties, amounts, payment channels, instruments, and other relevant attributes +### Step 2: Scope and Collect +**Note**: You have limited context window and limited number of queries to the database. Be strategic with the queries \ +you run to avoid hitting limits before gathering enough evidence to make a determination. -### Step 3: Scope and Collect -Pull related activity for involved entities within the investigation window (`window_start` to `seed_timestamp`, inclusive). +**For each account you investigate**: -### Step 4: Assess Benign Explanations (Default Hypothesis) +1. **Always start with aggregates**: + ``` + - COUNT(*) transactions + - COUNT(DISTINCT counterparty) + - SUM(amount) by direction + - Distribution by payment type/time + ``` + +2. **Pull details selectively**: + - If count ≤ 20 transactions: Safe to SELECT all + - If count > 20: Query top counterparties, then pull samples for suspicious patterns + - Never pull thousands of raw transactions - use aggregates + samples + +3. 
**Expand strategically**: + - Follow promising leads from aggregates (unusual counterparties, timing clusters) + - Maximum 2-3 hops from seed unless clear layering chain + +### Step 3: Assess Benign Explanations (Default Hypothesis) Attempt to explain observed activity as legitimate first: - State which evidence supports the benign hypothesis - Identify what additional data would strengthen this explanation -- Only proceed to Step 5 if benign explanations are insufficient +- Only proceed to Step 4 if benign explanations are insufficient -### Step 5: Test Laundering Hypotheses (If Needed) +### Step 4: Test Laundering Hypotheses (If Needed) If benign explanations fail to account for the evidence: - Test whether the evidence supports known laundering typologies - Cite concrete indicators that rule out benign explanations ## Typologies / Heuristics -When assessing patterns, consider these typologies: -- FAN-IN (aggregation): Many sources aggregating to one destination -- FAN-OUT (dispersion): One source dispersing to many destinations -- GATHER-SCATTER / SCATTER-GATHER: Aggregation followed by dispersion (or reverse) over short time windows -- STACK / LAYERING: Multiple hops meant to obscure origin -- CYCLE: Circular fund movement -- BIPARTITE: Structured flows between two distinct groups -- RANDOM: Complex pattern with no discernible structure +Consider the following typologies when assessing laundering patterns: + +- FAN-IN: *Many* distinct source accounts -> *One* destination account (consolidation/aggregation) +- FAN-OUT: *One* source account -> *Many* distinct destination accounts (distribution/dispersion) +- GATHER-SCATTER: *Many* sources -> *One* hub -> *Many* destinations (in that temporal order) + - First phase: Hub gathers from multiple sources + - Second phase: Hub scatters to multiple destinations + - Time gap between phases: typically hours to days. 
+- SCATTER-GATHER: *One* source -> *Many* intermediaries -> *One* destination (in that temporal order) + - First phase: Source scatters to multiple intermediaries + - Second phase: Intermediaries gather to final destination + - Creates layering through multiple parallel paths. +- STACK / LAYERING: Sequential hops through multiple accounts (linear chain). The purpose is typically to obscure the \ + origin through distance/complexity. +- CYCLE: Funds return to their origin point, creating a circular flow. +- BIPARTITE: Structured flows between two distinct, segregated groups with no within-group transactions. The segregation \ + and lack of within-group transactions is the defining characteristic. It's not just two-way flows, it's structured \ + isolation between groups. +- RANDOM: Complex pattern with no discernible structure. Use only when activity is clearly suspicious but doesn't fit \ + other typologies. +- NONE: No laundering pattern is supported by evidence in the investigation window. ## Output Format Return a single JSON object matching the configured output schema exactly. Populate every field. Use `pattern_type = "NONE"` when no laundering pattern is supported by evidence in the investigation window. +**Rules for flagging transaction IDs**: +- **Causal Chain Only**: Include *only* the transactions that form the identified laundering pattern. +- **Exclude Noise**: If an account has more transactions but only 3 are part of the laundering chain, output *only* those 3 IDs. +- When flagging transaction IDs, the seed transaction should be the last transaction in the chain (i.e., the most recent transaction), \ + since the investigation window ends with the seed transaction. + ## Handling Uncertainty If you lack sufficient information to make a determination, explicitly state what data is missing. \ Do not fabricate transaction details or make unsupported inferences. 
When uncertain between benign and suspicious, \ @@ -110,6 +151,7 @@ def create_aml_investigation_agent( after_agent_callback: AfterAgentCallback | None = None, before_model_callback: BeforeModelCallback | None = None, after_model_callback: AfterModelCallback | None = None, + timeout_sec: int | None = None, enable_tracing: bool = True, ) -> LlmAgent: """Create a configured AML investigation agent. @@ -155,6 +197,9 @@ def create_aml_investigation_agent( Callback executed before each model call. after_model_callback : AfterModelCallback | None, optional Callback executed after each model call. + timeout_sec : int | None, optional + Optional timeout in seconds for model calls. If specified, model calls + that exceed this duration will be cancelled. enable_tracing : bool, optional, default=True Whether to initialize Langfuse tracing for this agent. If ``True``, Langfuse tracing is initialized with the agent's name as the service name. @@ -201,6 +246,7 @@ def create_aml_investigation_agent( instruction=instructions or ANALYST_PROMPT, tools=[FunctionTool(db.get_schema_info), FunctionTool(db.execute)], generate_content_config=GenerateContentConfig( + http_options=HttpOptions(timeout=timeout_sec * 1000) if timeout_sec is not None else None, temperature=temperature, top_p=top_p, top_k=top_k, diff --git a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py index 0ba833c..3572f8e 100644 --- a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py +++ b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py @@ -99,7 +99,7 @@ class AnalystOutput(BaseModel): pattern_type: LaunderingPattern = Field(..., description="The type of laundering pattern in the case.") pattern_description: str = Field(..., description="A short description of the laundering pattern.") flagged_transaction_ids: str = Field( - ..., description="A comma-separated list of transaction IDs flagged by 
the analyst as suspicious." + ..., description="A string of comma-separated transaction IDs that are make up the laundering pattern." ) diff --git a/aieng-eval-agents/aieng/agent_evals/evaluation/types.py b/aieng-eval-agents/aieng/agent_evals/evaluation/types.py index c81a714..5dba29a 100644 --- a/aieng-eval-agents/aieng/agent_evals/evaluation/types.py +++ b/aieng-eval-agents/aieng/agent_evals/evaluation/types.py @@ -137,15 +137,12 @@ class TraceEvalResult: Trace IDs that failed due to errors during evaluation. errors_by_trace_id : dict[str, str] Error messages associated with skipped or failed traces. - run_evaluations : list[Evaluation] - Aggregated trace evaluation metrics written at dataset-run level. """ evaluations_by_trace_id: dict[str, list[Evaluation]] = field(default_factory=dict) skipped_trace_ids: list[str] = field(default_factory=list) failed_trace_ids: list[str] = field(default_factory=list) errors_by_trace_id: dict[str, str] = field(default_factory=dict) - run_evaluations: list[Evaluation] = field(default_factory=list) @dataclass(frozen=True) diff --git a/implementations/aml_investigation/evaluate.py b/implementations/aml_investigation/evaluate.py new file mode 100644 index 0000000..dbb102b --- /dev/null +++ b/implementations/aml_investigation/evaluate.py @@ -0,0 +1,204 @@ +"""Evaluate the AML investigation agent. + +This script uploads the AML investigation dataset to Langfuse, runs the evaluation +experiment with item-level and trace-level evaluators, and displays the results +in the console. The evaluation includes deterministic grading based on known ground +truth, as well as LLM-based assessments of narrative quality and trace groundedness. 
+ +Example +------- +$ uv run --env-file .env implementations/aml_investigation/evaluate.py \ + --dataset-path implementations/aml_investigation/data/aml_cases.jsonl \ + --dataset-name AML-investigation +""" + +import asyncio +import logging +from functools import partial + +import click +from aieng.agent_evals.aml_investigation.agent import create_aml_investigation_agent +from aieng.agent_evals.aml_investigation.graders import ( + item_level_deterministic_grader, + run_level_grader, + trace_deterministic_grader, +) +from aieng.agent_evals.aml_investigation.task import AmlInvestigationTask +from aieng.agent_evals.db_manager import DbManager +from aieng.agent_evals.display import create_console, display_info, display_metrics_table +from aieng.agent_evals.evaluation import TraceWaitConfig +from aieng.agent_evals.evaluation.experiment import run_experiment_with_trace_evals +from aieng.agent_evals.evaluation.graders import ( + create_llm_as_judge_evaluator, + create_trace_groundedness_evaluator, +) +from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig +from aieng.agent_evals.langfuse import upload_dataset_to_langfuse +from rich.logging import RichHandler + + +logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_path=False)], force=True) + +# Silence verbose INFO logs from Google ADK +logging.getLogger("google_adk").setLevel(logging.WARNING) + +logger = logging.getLogger(__name__) + + +@click.command() +@click.option( + "--dataset-path", + type=click.Path(exists=True, dir_okay=False, readable=True), + required=True, + help="Path to the dataset JSONL file.", +) +@click.option("--dataset-name", type=str, required=True, help="Name of the dataset to upload to Langfuse.") +@click.option( + "--agent-timeout", + type=click.IntRange(min=1, max_open=True), + default=300, + help="Timeout in seconds for the AML investigation agent.", +) +@click.option( + "--llm-judge-timeout", + type=click.IntRange(min=1, max_open=True), + 
default=120, + help="Timeout in seconds for LLM judge evaluations.", +) +@click.option( + "--llm-judge-retries", + type=click.IntRange(min=0, max_open=True), + default=3, + help="Number of retry attempts for LLM judge evaluations in case of failures.", +) +@click.option( + "--max-concurrent-cases", + type=click.IntRange(min=1, max=10), + default=5, + help="Maximum number of concurrent cases to process during evaluation.", +) +@click.option( + "--max-concurrent-traces", + type=click.IntRange(min=1, max=10), + default=10, + help="Maximum number of concurrent traces to process during evaluation.", +) +@click.option( + "--max-trace-wait-time", + type=click.IntRange(min=1, max_open=True), + default=300, + help="Maximum time in seconds to wait for trace data to be ready during evaluation.", +) +def cli( + dataset_path: str, + dataset_name: str, + llm_judge_timeout: int, + llm_judge_retries: int, + agent_timeout: int, + max_concurrent_cases: int, + max_concurrent_traces: int, + max_trace_wait_time: int, +) -> None: + """Evaluate AML Investigation agent on a given dataset. + + Parameters + ---------- + dataset_path : str + Path to the dataset JSONL file containing AML cases. + dataset_name : str + Name of the dataset to upload to Langfuse for evaluation. + llm_judge_timeout : int + Timeout in seconds for LLM-based judge evaluations. + llm_judge_retries : int + Number of retry attempts for LLM judge evaluations in case of failures. + agent_timeout : int + Timeout in seconds for the AML investigation agent to complete each case. + max_concurrent_cases : int + Maximum number of concurrent cases to process during evaluation. + max_concurrent_traces : int + Maximum number of concurrent traces to process during evaluation. + max_trace_wait_time : int + Maximum time in seconds to wait for trace data to be ready during evaluation. 
+ """ + # Create console for rich formatted output + console = create_console(force_jupyter=False) + + # Upload dataset to Langfuse + asyncio.run(upload_dataset_to_langfuse(dataset_path, dataset_name)) + + # Define graders/evaluators + # Item-level LLM-as-a-judge evaluator assesses the quality of the agent's + # narrative output based on a rubric. + narrative_quality_evaluator = create_llm_as_judge_evaluator( + name="narrative_quality", + rubric_markdown="implementations/aml_investigation/rubrics/narrative_pattern_quality.md", + model_config=LLMRequestConfig(timeout_sec=llm_judge_timeout, retry_max_attempts=llm_judge_retries), + ) + + # Trace-level graders assess the correctness of tool use and the groundedness + # of the agent's response based on trace data. + db_policy = DbManager().aml_db().policy + deterministic_trace_grader = partial(trace_deterministic_grader, db_policy=db_policy) + trace_groundedness_evaluator = create_trace_groundedness_evaluator( + model_config=LLMRequestConfig(timeout_sec=llm_judge_timeout, retry_max_attempts=llm_judge_retries) + ) + + agent = create_aml_investigation_agent(timeout_sec=agent_timeout) + results = run_experiment_with_trace_evals( + dataset_name=dataset_name, + name="AML Investigation Evaluation", + task=AmlInvestigationTask(agent=agent), + evaluators=[item_level_deterministic_grader, narrative_quality_evaluator], + trace_evaluators=[deterministic_trace_grader, trace_groundedness_evaluator], + run_evaluators=[run_level_grader], + max_concurrency=max_concurrent_cases, + trace_max_concurrency=max_concurrent_traces, + trace_wait=TraceWaitConfig(max_wait_sec=max_trace_wait_time), + ) + + # Display item-level results + console.print("\n[bold cyan]📋 Item-Level Results[/bold cyan]\n") + for idx, item_result in enumerate(results.experiment.item_results, start=1): + item_metrics = {eval_.name: eval_.value for eval_ in item_result.evaluations} + # Try to get item ID from metadata, fall back to index + item_id = f"Item {idx}" + try: 
+ item = item_result.item + if item and isinstance(item, dict): + metadata = item.get("metadata", {}) + if metadata and isinstance(metadata, dict): + item_id = metadata.get("id", item_id) + elif item and hasattr(item, "metadata"): + metadata = getattr(item, "metadata", None) + if metadata and isinstance(metadata, dict): + item_id = metadata.get("id", item_id) + except Exception: + pass # Keep default item_id + + display_metrics_table( + metrics=item_metrics, + title=str(item_id), + console=console, + ) + + # Display run-level metrics + if hasattr(results.experiment, "run_evaluations") and results.experiment.run_evaluations: + console.print("\n[bold green]📊 Run-Level Metrics[/bold green]\n") + run_metrics = {eval_.name: eval_.value for eval_ in results.experiment.run_evaluations} + display_metrics_table(metrics=run_metrics, title="Aggregate Performance", console=console) + + # Display trace evaluation summary + if results.trace_evaluations: + console.print("\n[bold magenta]🔍 Trace Evaluation Summary[/bold magenta]\n") + trace_summary: dict[str, float | int | str] = { + "Successful Traces": len(results.trace_evaluations.evaluations_by_trace_id), + "Skipped Traces": len(results.trace_evaluations.skipped_trace_ids), + "Failed Traces": len(results.trace_evaluations.failed_trace_ids), + } + display_metrics_table(metrics=trace_summary, title="Trace Processing", console=console) + + display_info("Evaluation complete! Results have been uploaded to Langfuse.", console=console) + + +if __name__ == "__main__": + cli() diff --git a/implementations/aml_investigation/rubrics/narrative_pattern_quality.md b/implementations/aml_investigation/rubrics/narrative_pattern_quality.md new file mode 100644 index 0000000..1074e3f --- /dev/null +++ b/implementations/aml_investigation/rubrics/narrative_pattern_quality.md @@ -0,0 +1,29 @@ +This rubric scores investigation reasoning quality only. It does not score grammar, writing style, tone, or fluency. 
+ +### Scoring Table + +| Score | `summary_narrative_quality` | `pattern_description_quality` | +| --- | --- | --- | +| 5 | Evidence-grounded and coherent investigation logic. Explicitly considers and rules out plausible benign explanations. Conclusion is fully consistent with cited evidence and final decision fields. | Clear mechanism-level typology description: flow shape, actor roles, fund movement pattern, and temporal logic are explicit and consistent with the conclusion. | +| 4 | Strong evidence linkage and mostly coherent logic, with only minor omissions or weak spots. | Mechanism is mostly correct and specific, with minor incompleteness. | +| 3 | Mixed quality. Some grounding and reasoning are present, but analysis is partially generic, incomplete, or weakly connected to evidence. | Partially correct mechanism but vague, generic, or only partially connected to evidence. | +| 2 | Weak evidence grounding with major reasoning gaps, leaps, or unsupported inferences. | Mostly vague description and/or materially incomplete with partial inaccuracies. | +| 1 | Reasoning is unsupported, contradictory, or materially inconsistent with available evidence. | Incorrect, contradictory, or effectively empty mechanism description. | + +### Hard Guardrails (Reasoning Quality With Hard Floors) + +- If the narrative contains material unsupported claims: `summary_narrative_quality <= 2`. +- If the narrative contradicts final decision fields (`is_laundering`, `pattern_type`): `summary_narrative_quality = 1`. +- If the pattern description contradicts decision fields or typology semantics: `pattern_description_quality = 1`. +- If the pattern description is effectively placeholder or non-informative: `pattern_description_quality <= 2`. + +### Scoring Instructions + +- Use integers only: `1`, `2`, `3`, `4`, `5`. +- Judge only from the provided input, expected output, and candidate output. +- Keep comments concise and evidence-focused. 
+ +### Special Cases + +- If ground-truth `pattern_description` is missing, `N/A`, or equivalent placeholder text, treat any coherent candidate pattern description as valid when it is consistent with other fields (especially `is_laundering` and `pattern_type`). +- Ground-truth `pattern_description` may be terse typology shorthand (for example, `Max 1-degree Fan-In, Max 10-degree Fan-Out, Max 7 hops`). In these cases, evaluate semantic consistency with typology mechanics rather than exact phrasing. From be2858b675557ee0f1494e2cdec8292d354b0ff1 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:58:35 -0500 Subject: [PATCH 2/3] Update README to include documentation on evaluation script --- implementations/aml_investigation/README.md | 50 +++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/implementations/aml_investigation/README.md b/implementations/aml_investigation/README.md index 3f2c9df..31f852f 100644 --- a/implementations/aml_investigation/README.md +++ b/implementations/aml_investigation/README.md @@ -81,6 +81,56 @@ uv run --env-file .env implementations/aml_investigation/cli.py The script prints a simple confusion matrix for `is_laundering` based on the cases that have `output`. +## Evaluate the Agent + +The evaluation script uploads the AML investigation dataset to Langfuse, runs a comprehensive evaluation experiment with multiple types of evaluators, and displays results in the console. 
+ +```bash +uv run --env-file .env implementations/aml_investigation/evaluate.py \ + --dataset-path implementations/aml_investigation/data/aml_cases.jsonl \ + --dataset-name AML-investigation +``` + +### Evaluation Levels + +The evaluation framework assesses agent performance at three levels: + +**Item-Level Evaluators** — Score each individual case prediction: + +- **Deterministic grader**: Checks correctness of `is_laundering`, `pattern_type`, and flagged transaction IDs against ground truth +- **Narrative quality evaluator**: LLM-as-judge that scores the investigation reasoning and pattern description quality using the rubric in `rubrics/narrative_pattern_quality.md` + +**Trace-Level Evaluators** — Analyze tool-use trajectories for each agent run: + +- **Trace deterministic grader**: Validates SQL safety (read-only compliance), time window adherence, and query redundancy metrics +- **Trace groundedness evaluator**: LLM-based assessment of whether the agent's narrative is grounded in the actual tool outputs + +**Run-Level Grader** — Aggregates results across all cases: + +- Computes precision, recall, and F1-score for `is_laundering` detection +- Generates macro F1-score and confusion matrix for `pattern_type` classification + +### CLI Options + +Key options you may want to adjust: + +- `--agent-timeout`: Timeout in seconds for each agent run (default: 300) +- `--llm-judge-timeout`: Timeout for LLM judge evaluations (default: 120) +- `--llm-judge-retries`: Retry attempts for LLM judge failures (default: 3) +- `--max-concurrent-cases`: Maximum concurrent cases to process (default: 5) +- `--max-concurrent-traces`: Maximum concurrent trace evaluations (default: 10) +- `--max-trace-wait-time`: Maximum seconds to wait for trace data (default: 300) + +### Output + +The evaluation displays: + +1. **Per-item metrics tables**: Shows deterministic and narrative quality scores for each case +2. 
**Run-level aggregate metrics**: Overall precision, recall, F1-score, and confusion matrix +3. **Trace evaluation summary**: Count of successful, skipped, and failed trace evaluations + +All results are uploaded to Langfuse for further analysis and visualization. + ## Run with ADK Web UI If you want to inspect the agent interactively, the module exposes a top-level `root_agent` for ADK discovery. From 48fb98a6f3eadb610946ca064d298fd3c06fa784 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:58:45 -0500 Subject: [PATCH 3/3] Fix typo in AnalystOutput model description for flagged_transaction_ids field --- .../aieng/agent_evals/aml_investigation/data/cases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py index 3572f8e..a25810c 100644 --- a/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py +++ b/aieng-eval-agents/aieng/agent_evals/aml_investigation/data/cases.py @@ -99,7 +99,7 @@ class AnalystOutput(BaseModel): pattern_type: LaunderingPattern = Field(..., description="The type of laundering pattern in the case.") pattern_description: str = Field(..., description="A short description of the laundering pattern.") flagged_transaction_ids: str = Field( - ..., description="A string of comma-separated transaction IDs that are make up the laundering pattern." + ..., description="A string of comma-separated transaction IDs that make up the laundering pattern." )