From cccbf83722dab348a0573f50cf1f353602e67376 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Fri, 20 Feb 2026 11:13:13 -0500 Subject: [PATCH 1/2] Add notebooks for AML investigation use case --- .../aml_investigation/01_data_and_tools.ipynb | 623 ++++++++++++++++++ .../02_running_the_agent.ipynb | 530 +++++++++++++++ .../aml_investigation/03_evaluation.ipynb | 429 ++++++++++++ 3 files changed, 1582 insertions(+) create mode 100644 implementations/aml_investigation/01_data_and_tools.ipynb create mode 100644 implementations/aml_investigation/02_running_the_agent.ipynb create mode 100644 implementations/aml_investigation/03_evaluation.ipynb diff --git a/implementations/aml_investigation/01_data_and_tools.ipynb b/implementations/aml_investigation/01_data_and_tools.ipynb new file mode 100644 index 0000000..b24b9c9 --- /dev/null +++ b/implementations/aml_investigation/01_data_and_tools.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3ab9f1d4", + "metadata": {}, + "source": [ + "# Data & Tools Exploration\n", + "\n", + "This notebook walks you through the foundational layer of the AML investigation setup:\n", + "- What the database looks like and how it's built\n", + "- How to explore transactions and accounts manually\n", + "- What the `ReadOnlySqlDatabase` tool is and why it exists\n", + "- How an agent would \"see\" the database through that tool" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70f1ce5d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sqlite3\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from aieng.agent_evals.aml_investigation.data import download_dataset_file, normalize_transactions_data\n", + "from aieng.agent_evals.tools.sql_database import ReadOnlySqlDatabase\n", + "from dotenv import load_dotenv\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "load_dotenv(verbose=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e0ee671f", + "metadata": {}, + "source": [ + "## 1. Exploring the dataset\n", + "\n", + "We will be using the [IBM Transactions for Anti Money Laundering (AML)](https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml) dataset, which is available on Kaggle. It contains synthetic transaction data designed to mimic real-world financial transactions, including both legitimate and potentially fraudulent activities. The dataset includes various features such as transaction amount, type, origin and destination accounts, timestamps, and a label indicating whether the transaction is fraudulent or not." + ] + }, + { + "cell_type": "markdown", + "id": "e172158d", + "metadata": {}, + "source": [ + "### 1.1 Downloading the dataset\n", + "\n", + "There are 6 datasets available, divided into two groups of three sets. The groups are based on the ratio of illicit transactions in the data:\n", + "- Group **HI** contains relatively higher illicit transaction ratios (i.e. more laundering activity)\n", + "- Group **LI** contains relatively lower illicit transaction ratios (i.e. less laundering activity)\n", + "\n", + "Each group has three sets of data based on the total number of transactions/accounts: \"Small\", \"Medium\", and \"Large\".\n", + "\n", + "You can download any of the six datasets using the `download_dataset_file` function. However, **note that the code tries to load all the data into memory at once, so the \"Medium\" and \"Large\" datasets may cause memory issues on some machines**. For this reason, we recommend starting with the \"Small\" dataset.\n", + "\n", + "Each dataset has 3 files that you can download separately:\n", + "- `__Trans.csv`: contains transaction data, with each row representing a single transaction.\n", + "- `__accounts.csv`: contains account data, with each row representing a single account.\n", + "- `__Patterns.txt`: contains ground-truth laundering patterns, which are groups of transactions that are known to be part of the same laundering scheme. Each pattern includes a list of transaction IDs that are involved in that pattern." + ] + }, + { + "cell_type": "markdown", + "id": "89368db6", + "metadata": {}, + "source": [ + "#### 1.1.1. The transactions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a1d1be1", + "metadata": {}, + "outputs": [], + "source": [ + "path_to_transactions_csv = download_dataset_file(illicit_ratio=\"HI\", transactions_size=\"Small\", filename=\"Trans.csv\")\n", + "print(f\"Path to transactions.csv: {path_to_transactions_csv}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d0ce63d", + "metadata": {}, + "outputs": [], + "source": [ + "transactions_df = pd.read_csv(path_to_transactions_csv)\n", + "transactions_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57334721", + "metadata": {}, + "outputs": [], + "source": [ + "# Are there duplicates?\n", + "print(f\"Number of duplicate transactions: {transactions_df.duplicated().sum()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7ddea19c", + "metadata": {}, + "source": [ + "Notice that the transactions dataset needs some cleaning. For example:\n", + "- There are duplicate transactions that should be removed before analysis.\n", + "- There are two columns that have the same name \"Account\". Pandas automatically renamed the second one to \"Account.1\", but we should rename them to something more descriptive.\n", + "\n", + "We use the `normalize_transactions_data` function to perform these cleaning steps and make the transactions data easier to work with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d17029", + "metadata": {}, + "outputs": [], + "source": [ + "transactions_df = normalize_transactions_data(transactions_df)\n", + "transactions_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "bd8b202f", + "metadata": {}, + "source": [ + "#### 1.1.2. The accounts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43c08be5", + "metadata": {}, + "outputs": [], + "source": [ + "path_to_accounts_csv = download_dataset_file(illicit_ratio=\"HI\", transactions_size=\"Small\", filename=\"accounts.csv\")\n", + "print(f\"Path to accounts.csv: {path_to_accounts_csv}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72b10fb6", + "metadata": {}, + "outputs": [], + "source": [ + "accounts_df = pd.read_csv(path_to_accounts_csv)\n", + "accounts_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "bddee34b", + "metadata": {}, + "source": [ + "Similar to the transactions dataset, we can rename the columns in the accounts dataset to make them easier to work with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da2eb793", + "metadata": {}, + "outputs": [], + "source": [ + "accounts_df.rename(\n", + " columns={\n", + " \"Bank Name\": \"bank_name\",\n", + " \"Bank ID\": \"bank_id\",\n", + " \"Account Number\": \"account_number\",\n", + " \"Entity ID\": \"entity_id\",\n", + " \"Entity Name\": \"entity_name\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "accounts_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "f78dff88", + "metadata": {}, + "source": [ + "#### 1.1.3. The patterns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e7bdeba", + "metadata": {}, + "outputs": [], + "source": [ + "path_to_patterns_txt = download_dataset_file(illicit_ratio=\"HI\", transactions_size=\"Small\", filename=\"Patterns.txt\")\n", + "print(f\"Path to patterns.txt: {path_to_patterns_txt}\")" + ] + }, + { + "cell_type": "markdown", + "id": "58af8a8d", + "metadata": {}, + "source": [ + "Laundering patterns start with `BEGIN LAUNDERING ATTEMPT` and end with `END LAUNDERING ATTEMPT`. Each pattern includes a list of transactions that are involved in that pattern." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8820b07d", + "metadata": {}, + "outputs": [], + "source": [ + "# Print the first laundering pattern\n", + "begin_prefix = \"BEGIN LAUNDERING ATTEMPT - \"\n", + "end_prefix = \"END LAUNDERING ATTEMPT\"\n", + "with open(path_to_patterns_txt, \"r\") as f:\n", + " for line in f:\n", + " if line.startswith(begin_prefix):\n", + " print(f\"\\n{line.strip()}\")\n", + " elif line.startswith(end_prefix):\n", + " print(f\"{line.strip()}\")\n", + " break\n", + " else:\n", + " print(line.strip())" + ] + }, + { + "cell_type": "markdown", + "id": "0227686a", + "metadata": {}, + "source": [ + "## 2. Build the Database\n", + "\n", + "With the datasets downloaded and cleaned, we can build a SQLite database that contains the transactions and accounts data. This will allow us to query the data using SQL, which is a common way for agents to interact with databases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c98ac3b", + "metadata": {}, + "outputs": [], + "source": [ + "DB_PATH = Path(\"implementations/aml_investigation/data/aml_transactions.db\")\n", + "DDL_PATH = Path(\"implementations/aml_investigation/data/schema.ddl\")\n", + "\n", + "print(f\"Database exists: {DB_PATH.exists()}\")\n", + "print(f\"DDL file exists: {DDL_PATH.exists()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec16217e", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell only if the database doesn't exist yet.\n", + "# It will build the SQLite DB. It may take some time to run.\n", + "\n", + "if not DB_PATH.exists():\n", + " with sqlite3.connect(DB_PATH) as conn:\n", + " conn.execute(\"PRAGMA foreign_keys = ON;\")\n", + "\n", + " if not DDL_PATH.exists():\n", + " raise FileNotFoundError(f\"DDL file not found at {DDL_PATH}\")\n", + "\n", + " with open(DDL_PATH, \"r\") as file:\n", + " conn.executescript(file.read())\n", + " conn.commit()\n", + "\n", + " # Add new columns to the transactions DataFrame: date, day_of_week, time_of_day\n", + " transactions_df[\"date\"] = pd.to_datetime(transactions_df[\"timestamp\"]).dt.date\n", + " transactions_df[\"day_of_week\"] = pd.to_datetime(transactions_df[\"timestamp\"]).dt.day_name()\n", + " transactions_df[\"time_of_day\"] = pd.to_datetime(transactions_df[\"timestamp\"]).dt.time\n", + "\n", + " # Set Transaction ID as index\n", + " transactions_df.set_index(\"transaction_id\", drop=True, inplace=True)\n", + "\n", + " # NOTE: We drop the \"is_laundering\" column since that's the label the\n", + " # agent is trying to predict, and it wouldn't be present in a real\n", + " # investigation scenario.\n", + " transactions_df.drop(columns=[\"is_laundering\"], inplace=True)\n", + "\n", + " accounts_df.to_sql(\"accounts\", conn, if_exists=\"append\", index=False)\n", + " transactions_df.to_sql(\"transactions\", conn, if_exists=\"append\")\n", + "else:\n", + " print(\"Database already exists — skipping creation.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a2c7068c", + "metadata": {}, + "source": [ + "### 2.1. Understand the Schema\n", + "\n", + "The database has two core tables and one convenience view:\n", + "\n", + "- **`accounts`** — who owns each account (bank, account number, entity name)\n", + "- **`transactions`** — every transfer between accounts (amount, currency, timestamp, payment format)\n", + "- **`account_transactions`** (view) — a flattened, account-centric view of transactions. Each transaction appears **twice**: once as an OUT row for the sender, once as an IN row for the receiver. This makes it easy to query all activity for a single account without a `UNION` every time.\n", + "\n", + "Let's look at the raw DDL:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c5f581", + "metadata": {}, + "outputs": [], + "source": [ + "print(DDL_PATH.read_text())" + ] + }, + { + "cell_type": "markdown", + "id": "f20770ff", + "metadata": {}, + "source": [ + "## 3. Manual Exploration with `pandas` + `sqlite3`\n", + "\n", + "Let's get familiar with the data before involving any agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "045bdfde", + "metadata": {}, + "outputs": [], + "source": [ + "conn = sqlite3.connect(DB_PATH)\n", + "\n", + "# Quick sanity check: how many rows in each table?\n", + "for table in [\"accounts\", \"transactions\"]:\n", + " count = pd.read_sql(f\"SELECT COUNT(*) AS n FROM {table}\", conn).iloc[0][\"n\"]\n", + " print(f\"{table:20s}: {count:,} rows\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "542190d4", + "metadata": {}, + "outputs": [], + "source": [ + "# Preview the accounts table\n", + "pd.read_sql(\"SELECT * FROM accounts LIMIT 10\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "762a7f64", + "metadata": {}, + "outputs": [], + "source": [ + "# Preview the transactions table\n", + "pd.read_sql(\"SELECT * FROM transactions LIMIT 10\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85026e0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Preview the account_transactions view\n", + "# Notice each transaction appears as both an IN row and an OUT row\n", + "sample_tx = pd.read_sql(\"SELECT transaction_id FROM transactions LIMIT 1\", conn).iloc[0][\"transaction_id\"]\n", + "\n", + "print(f\"Looking up transaction: {sample_tx}\\n\")\n", + "pd.read_sql(f\"SELECT * FROM account_transactions WHERE transaction_id = '{sample_tx}'\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d10ed91b", + "metadata": {}, + "outputs": [], + "source": [ + "conn.close()" + ] + }, + { + "cell_type": "markdown", + "id": "26917348", + "metadata": {}, + "source": [ + "## 4. The `ReadOnlySqlDatabase` Tool\n", + "\n", + "So far we've been using `sqlite3` directly — a regular connection that could run `DROP TABLE` or `DELETE` if we wanted. \n", + "\n", + "When an LLM agent runs SQL, we can't have it modifying data. The [`ReadOnlySqlDatabase`](https://github.com/VectorInstitute/eval-agents/blob/main/aieng-eval-agents/aieng/agent_evals/tools/sql_database.py) tool solves this with two layers of protection:\n", + "\n", + "1. **AST-level enforcement** — It parses the SQL into a syntax tree using [SQLGlot](https://sqlglot.com/) and rejects any query that contains write operations (`INSERT`, `UPDATE`, `DROP`, etc.), even if hidden inside a CTE or subquery.\n", + "2. **Row limits + timeouts** — It caps results at `max_rows` (default 100) and cancels slow queries, preventing runaway costs.\n", + "\n", + "The tool exposes exactly **two methods** that become the agent's \"tools\":\n", + "- `get_schema_info()` — returns the table/column names\n", + "- `execute(query)` — runs a SELECT and returns a markdown table string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7880a55", + "metadata": {}, + "outputs": [], + "source": [ + "db = ReadOnlySqlDatabase(\n", + " connection_uri=f\"sqlite:///{DB_PATH}\",\n", + " agent_name=\"NotebookExplorer\",\n", + " max_rows=10, # keep output short for this notebook\n", + ")\n", + "\n", + "print(\"Tool created successfully!\")\n", + "print(f\"Max rows: {db.max_rows}\")\n", + "print(f\"Agent name: {db.agent_name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c44192f1", + "metadata": {}, + "source": [ + "### 4.1. Schema Discovery\n", + "\n", + "This is the first thing the agent does on every case — ask \"what tables exist and what columns do they have?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a3f2ef3", + "metadata": {}, + "outputs": [], + "source": [ + "schema = db.get_schema_info()\n", + "print(schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86ae1f8", + "metadata": {}, + "outputs": [], + "source": [ + "# You can also ask for a specific table only\n", + "print(db.get_schema_info(table_names=[\"transactions\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "57446310", + "metadata": {}, + "source": [ + "### 4.2. Running Queries Through the Tool\n", + "\n", + "Notice that the output is a **markdown table string** — not a DataFrame. This is intentional: the agent receives it as plain text in its context window." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "671804f9", + "metadata": {}, + "outputs": [], + "source": [ + "result = db.execute(\"SELECT * FROM accounts LIMIT 5\")\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8c32cae", + "metadata": {}, + "outputs": [], + "source": [ + "# Aggregation query. This is the kind of query the agent would run\n", + "result = db.execute(\"\"\"\n", + " SELECT\n", + " account,\n", + " COUNT(*) AS tx_count,\n", + " COUNT(DISTINCT counterparty) AS unique_counterparties,\n", + " SUM(CASE WHEN direction='IN' THEN amount ELSE 0 END) AS total_in,\n", + " SUM(CASE WHEN direction='OUT' THEN amount ELSE 0 END) AS total_out\n", + " FROM account_transactions\n", + " GROUP BY account\n", + " ORDER BY tx_count DESC\n", + " LIMIT 10\n", + "\"\"\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "0243c2b6", + "metadata": {}, + "source": [ + "### 4.3. Safety Demo — Write Operations Are Blocked\n", + "\n", + "Let's verify the protection actually works. The tool should reject any write operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3b97afe", + "metadata": {}, + "outputs": [], + "source": [ + "# Attempting a DELETE. This should be blocked\n", + "result = db.execute(\"DELETE FROM transactions WHERE 1=1\")\n", + "print(result) # Expect: \"Query Error: Security Violation...\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd9e05ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Attempting a write hidden inside a CTE — also blocked\n", + "result = db.execute(\"\"\"\n", + " WITH cleanup AS (\n", + " DELETE FROM transactions WHERE 1=1\n", + " )\n", + " SELECT * FROM accounts LIMIT 1\n", + "\"\"\")\n", + "print(result) # Expect: \"Query Error: Security Violation...\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7565f1ea", + "metadata": {}, + "outputs": [], + "source": [ + "# Row limit enforcement — we set max_rows=10 above, so this won't return all rows\n", + "result = db.execute(\"SELECT * FROM transactions\")\n", + "# Check the last line — it should say \"Truncated at 10 rows\"\n", + "for line in result.split(\"\\n\")[-3:]:\n", + " print(line)" + ] + }, + { + "cell_type": "markdown", + "id": "6fae21ed", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "In this notebook you've seen:\n", + "\n", + "1. **The dataset** — the IBM Transactions for Anti Money Laundering (AML) dataset, its structure, and how to download and clean it.\n", + "2. **The database** — two tables (`accounts`, `transactions`) and a convenience view (`account_transactions`) storing synthetic AML transaction data.\n", + "3. **Manual exploration** — how to use `pandas` + `sqlite3` to query the data as a developer would.\n", + "4. **The `ReadOnlySqlDatabase` tool** — the safety-hardened wrapper the agent uses, with AST-level write blocking and row limits.\n", + "\n", + "**Next:** In Notebook 2, we'll instantiate the AML agent, explore how to give it tasks, then inspect its reasoning and tool call trace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9fc0daf", + "metadata": {}, + "outputs": [], + "source": [ + "db.close()\n", + "print(\"Done!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/aml_investigation/02_running_the_agent.ipynb b/implementations/aml_investigation/02_running_the_agent.ipynb new file mode 100644 index 0000000..d47dd6e --- /dev/null +++ b/implementations/aml_investigation/02_running_the_agent.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7fb27b941602401d91542211134fc71a", + "metadata": {}, + "source": [ + "# Case Files and Running the Agent\n", + "\n", + "In Notebook 1 we explored the raw database and the `ReadOnlySqlDatabase` tool.\n", + "Now we zoom out and ask: what problem is the agent actually solving, and how do we feed it a case?\n", + "\n", + "This notebook covers:\n", + "1. The real-world AML workflow we modelled\n", + "2. The data structures that represent a case\n", + "3. Why the evaluation dataset has four case types and what each one tests\n", + "4. How to generate cases from the raw data\n", + "5. Running a single case through the agent and inspecting its output\n", + "\n", + "---\n", + "\n", + "**Prerequisites:** Complete Notebook 1 first. The database must exist at `implementations/aml_investigation/data/aml_transactions.db`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c2ec10a", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from aieng.agent_evals.aml_investigation.data import (\n", + " CaseFile,\n", + " CaseRecord,\n", + " GroundTruth,\n", + " LaunderingPattern,\n", + " build_cases,\n", + " download_dataset_file,\n", + " normalize_transactions_data,\n", + ")\n", + "from aieng.agent_evals.aml_investigation.task import AmlInvestigationTask\n", + "from dotenv import load_dotenv\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "load_dotenv(verbose=True)" + ] + }, + { + "cell_type": "markdown", + "id": "acae54e37e7d407bbb7b55eff062a284", + "metadata": {}, + "source": [ + "## 1. Our Model of the Anti-Money Laundering Investigation Workflow\n", + "\n", + "In practice, AML investigations at financial institutions are more complex than what we model here. What we model is the core investigative loop: a transaction gets flagged, a case is opened, an analyst investigates, and the analyst produces a written determination.\n", + "\n", + "In our model, the workflow has three stages.\n", + "\n", + "First, an external alerting system flags a transaction. This could be a rules engine, an ML model, a law enforcement referral, or a routine sampling process. The system assigns a `trigger_label` to the case, which is a short string describing why the case was opened. Crucially, this label is noisy: it may be a strong signal (e.g. `FAN-OUT`, `LAW_ENFORCEMENT_REFERRAL`) or essentially no signal at all (e.g. `QA_SAMPLE`, `RANDOM_REVIEW`).\n", + "\n", + "Second, the case is opened with a structured record containing: a unique `case_id`, the flagged `seed_transaction_id`, the `seed_timestamp` (which marks the end of the investigation window), and a `window_start` timestamp (which marks how far back the analyst should look). The analyst is only expected to reason about events within that window.\n", + "\n", + "Third, the analyst investigates by querying the transaction database, identifies whether the activity is consistent with a laundering pattern, and produces a written output: a narrative summary, a verdict (`is_laundering`), a pattern classification, and the specific transaction IDs that form the suspicious chain.\n", + "\n", + "The agent mirrors this structure exactly. It receives the case record as a JSON object, queries the database, and returns a structured `AnalystOutput`." + ] + }, + { + "cell_type": "markdown", + "id": "9a63283cbaf04dbcab1f6479b197f3a8", + "metadata": {}, + "source": [ + "## 2. The Data Structures\n", + "\n", + "The agent's input and output are structured as Pydantic models. This allows us to enforce a schema at the model level, which simplifies prompt engineering and evaluation.\n", + "\n", + "**`CaseFile`** is what the agent receives. It contains only what a real analyst would be given at case open time: no ground truth, no answer.\n", + "\n", + "```python\n", + "class CaseFile(BaseModel):\n", + " case_id: str # unique identifier\n", + " seed_transaction_id: str # the flagged transaction\n", + " seed_timestamp: str # end of the investigation window\n", + " window_start: str # start of the investigation window\n", + " trigger_label: str # why the case was opened (may be noisy)\n", + "```\n", + "\n", + "**`GroundTruth`** records what actually happened. It is never shown to the agent. It is used only by the graders during evaluation.\n", + "\n", + "```python\n", + "class GroundTruth(BaseModel):\n", + " is_laundering: bool\n", + " pattern_type: LaunderingPattern # FAN-IN, FAN-OUT, CYCLE, ..., NONE\n", + " pattern_description: str\n", + " attempt_transaction_ids: str # comma-separated laundering chain\n", + "```\n", + "\n", + "**`AnalystOutput`** is what the agent must produce. Its schema is enforced at the model level via `output_schema`.\n", + "\n", + "```python\n", + "class AnalystOutput(BaseModel):\n", + " summary_narrative: str # the agent's reasoning\n", + " is_laundering: bool\n", + " pattern_type: LaunderingPattern\n", + " pattern_description: str\n", + " flagged_transaction_ids: str # the agent's identified laundering chain\n", + "```\n", + "\n", + "A **`CaseRecord`** bundles `input: CaseFile` and `expected_output: GroundTruth` together. This is the unit that goes into the Langfuse dataset. The `input` field is sent to the agent; the `expected_output` field is passed to the graders." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dd0d8092fe74a7c96281538738b07e2", + "metadata": {}, + "outputs": [], + "source": [ + "# Demonstrate the structure manually\n", + "example_case = CaseRecord(\n", + " input=CaseFile(\n", + " case_id=\"demo-001\",\n", + " seed_transaction_id=\"txn-abc\",\n", + " seed_timestamp=\"2022-09-15T14:30:00\",\n", + " window_start=\"2022-09-01T00:00:00\",\n", + " trigger_label=\"QA_SAMPLE\", # low-signal: gives no hint about laundering\n", + " ),\n", + " expected_output=GroundTruth(\n", + " is_laundering=True,\n", + " pattern_type=LaunderingPattern.FAN_OUT,\n", + " pattern_description=\"One source dispersing funds to many destinations.\",\n", + " attempt_transaction_ids=\"txn-abc,txn-def,txn-ghi\",\n", + " ),\n", + ")\n", + "\n", + "print(\"--- Input (what the agent sees) ---\")\n", + "print(example_case.input.model_dump_json(indent=2))\n", + "\n", + "print(\"\\n--- Expected Output (hidden from the agent; used for grading) ---\")\n", + "print(example_case.expected_output.model_dump_json(indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "72eea5119410473aa328ad9291626812", + "metadata": {}, + "source": [ + "## 3. The Four Case Types\n", + "\n", + "A robust evaluation dataset needs to test more than just \"can the agent find laundering?\". We deliberately construct four case types, each probing a different failure mode.\n", + "\n", + "| Case type | `is_laundering` (ground truth) | `trigger_label` | What it tests |\n", + "|---|---|---|---|\n", + "| **True Positive** | `True` | Pattern name (e.g. `FAN-OUT`) | Can the agent correctly identify and describe a real laundering pattern? |\n", + "| **True Negative** | `False` | Low-signal (`QA_SAMPLE`, `RANDOM_REVIEW`, ...) | Can the agent correctly clear a benign case without over-investigating? |\n", + "| **False Positive** | `False` | High-signal (`ANOMALOUS_BEHAVIOR_ALERT`, `LAW_ENFORCEMENT_REFERRAL`, ...) | Can the agent resist a misleading trigger and avoid a false alarm? |\n", + "| **False Negative** | `True` | Low-signal (`QA_SAMPLE`, `RANDOM_REVIEW`, ...) | Can the agent find laundering even when the trigger provides no hint? |\n", + "\n", + "The false positive and false negative cases are the most diagnostic. They test whether the agent can reason independently rather than follow the trigger label.\n", + "\n", + "### How each type is built\n", + "\n", + "**True Positives** are parsed from the `Patterns.txt` file in the Kaggle dataset. This file records every known laundering attempt: the accounts involved, the exact transactions, and the pattern type. The `trigger_label` is set to the pattern name, simulating an alerting system that correctly identified the behaviour.\n", + "\n", + "**True Negatives** sample random benign transactions from the dataset. The `trigger_label` is set to one of `QA_SAMPLE`, `RANDOM_REVIEW`, `RETROSPECTIVE_REVIEW`, or `MODEL_MONITORING_SAMPLE`, realistic labels for a routine compliance review that carries no signal about laundering.\n", + "\n", + "**False Positives** are built from benign accounts with an unusually high transaction volume on a single day. High volume is a common heuristic alert trigger, so these cases look suspicious at first glance. The trigger label is a high-signal label like `ANOMALOUS_BEHAVIOR_ALERT`, but the ground truth is `is_laundering=False`.\n", + "\n", + "**False Negatives** are taken from additional laundering attempts beyond those used as True Positives. The key difference: the `trigger_label` is swapped to a low-signal review label, removing any hint. The agent must discover the laundering through its own investigation." + ] + }, + { + "cell_type": "markdown", + "id": "8edb47106e1a46a883d545849b8ab81b", + "metadata": {}, + "source": [ + "## 4. Generating Case Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10185d26023b46108eb7d9f57d49d2b3", + "metadata": {}, + "outputs": [], + "source": [ + "CASES_PATH = Path(\"implementations/aml_investigation/data/aml_cases.jsonl\")\n", + "\n", + "ILLICIT_RATIO = \"HI\" # \"HI\" or \"LI\"\n", + "TRANSACTIONS_SIZE = \"Small\" # \"Small\", \"Medium\", or \"Large\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8763a12b2bbd4a93a75aff182afb95dc", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell only if cases do not exist yet.\n", + "# It downloads the dataset from Kaggle and may take a minute if files aren't cached\n", + "# locally.\n", + "\n", + "if not CASES_PATH.exists():\n", + " print(\"Downloading dataset files...\")\n", + " path_to_transc_csv = download_dataset_file(ILLICIT_RATIO, TRANSACTIONS_SIZE, \"Trans.csv\")\n", + " path_to_patterns_txt = download_dataset_file(ILLICIT_RATIO, TRANSACTIONS_SIZE, \"Patterns.txt\")\n", + " print(\"Download complete.\")\n", + "\n", + " print(\"Normalizing transactions...\")\n", + " transc_df = pd.read_csv(path_to_transc_csv, dtype_backend=\"pyarrow\")\n", + " transc_df = normalize_transactions_data(transc_df)\n", + " print(f\"Loaded {len(transc_df):,} transactions.\")\n", + "\n", + " print(\"Building cases...\")\n", + " cases = build_cases(\n", + " path_to_patterns_txt,\n", + " transc_df,\n", + " num_laundering_cases=5,\n", + " num_normal_cases=5,\n", + " num_false_negative_cases=3,\n", + " num_false_positive_cases=3,\n", + " lookback_days=10, # how far back from the seed transaction the agent should investigate\n", + " )\n", + " print(f\"Built {len(cases)} cases.\")\n", + "\n", + " CASES_PATH.parent.mkdir(parents=True, exist_ok=True)\n", + " with CASES_PATH.open(\"w\", encoding=\"utf-8\") as f:\n", + " for record in cases:\n", + " f.write(record.model_dump_json() + \"\\n\")\n", + " print(f\"Wrote cases to {CASES_PATH}\")\n", + "else:\n", + " print(f\"Cases already exist at {CASES_PATH}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7623eae2785240b9bd12b16a66d81610", + "metadata": {}, + "outputs": [], + "source": [ + "raw_cases = [json.loads(line) for line in CASES_PATH.read_text().splitlines() if line.strip()]\n", + "cases = [CaseRecord.model_validate(raw_case) for raw_case in raw_cases]\n", + "\n", + "print(f\"Total cases loaded: {len(cases)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cdc8c89c7104fffa095e18ddfef8986", + "metadata": {}, + "outputs": [], + "source": [ + "# Summary table of all cases\n", + "summary = pd.DataFrame(\n", + " [\n", + " {\n", + " \"case_id\": case.input.case_id[:12] + \"...\",\n", + " \"trigger_label\": case.input.trigger_label,\n", + " \"is_laundering\": case.expected_output.is_laundering,\n", + " \"pattern_type\": case.expected_output.pattern_type.value,\n", + " \"window_days\": (pd.Timestamp(case.input.seed_timestamp) - pd.Timestamp(case.input.window_start)).days,\n", + " }\n", + " for case in cases\n", + " ]\n", + ")\n", + "\n", + "print(summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b118ea5561624da68c537baed56e602f", + "metadata": {}, + "outputs": [], + "source": [ + "# Classify each case into one of the four types\n", + "_LOW_SIGNAL = {\"QA_SAMPLE\", \"RANDOM_REVIEW\", \"RETROSPECTIVE_REVIEW\", \"MODEL_MONITORING_SAMPLE\"}\n", + "_HIGH_SIGNAL = {\"ANOMALOUS_BEHAVIOR_ALERT\", \"LAW_ENFORCEMENT_REFERRAL\", \"EXTERNAL_TIP\"}\n", + "_PATTERN_LABELS = {p.value for p in LaunderingPattern if p != LaunderingPattern.NONE}\n", + "\n", + "\n", + "def classify_case(case: CaseRecord) -> str:\n", + " \"\"\"Classify a case record.\"\"\"\n", + " label = case.input.trigger_label\n", + " is_laundering = case.expected_output.is_laundering\n", + " if label in _PATTERN_LABELS and is_laundering:\n", + " return \"True Positive\"\n", + " if label in _LOW_SIGNAL and not is_laundering:\n", + " return \"True Negative\"\n", + " if (label in _HIGH_SIGNAL or label in _PATTERN_LABELS) and not is_laundering:\n", + " return \"False Positive\"\n", + " if label in _LOW_SIGNAL and is_laundering:\n", + " return \"False Negative\"\n", + " return \"Other\"\n", + "\n", + "\n", + "summary[\"case_type\"] = [classify_case(case) for case in cases]\n", + "print(summary[\"case_type\"].value_counts().to_string())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938c804e27f84196a10c8828c723f798", + "metadata": {}, + "outputs": [], + "source": [ + "# Print one representative example of each case type\n", + "for case_type in [\"True Positive\", \"True Negative\", \"False Positive\", \"False Negative\"]:\n", + " idx = summary[summary[\"case_type\"] == case_type].index\n", + " if len(idx) == 0:\n", + " print(f\"[{case_type}] no examples in this dataset\\n\")\n", + " continue\n", + " case = cases[idx[0]]\n", + " print(f\"=== {case_type} ===\")\n", + " print(f\" trigger_label : {case.input.trigger_label}\")\n", + " print(f\" is_laundering : {case.expected_output.is_laundering}\")\n", + " print(f\" pattern_type : {case.expected_output.pattern_type.value}\")\n", + " print(f\" window : {case.input.window_start} to {case.input.seed_timestamp}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "504fb2a444614c0babb325280ed9130a", + "metadata": {}, + "source": [ + "## 5. The Agent\n", + "\n", + "The agent is a Google ADK `LlmAgent` configured with three things:\n", + "\n", + "- A detailed system prompt (`ANALYST_PROMPT`) describing the investigation workflow, a strategy for querying the database efficiently (start with aggregates, expand selectively), and the laundering typologies to look for.\n", + "- Two tools: `get_schema_info()` and `execute(query)` from `ReadOnlySqlDatabase`, the same ones explored in Notebook 1.\n", + "- A structured output schema that enforces `AnalystOutput`, so the final response is always a valid, parseable object.\n", + "\n", + "`AmlInvestigationTask` is a thin wrapper around the agent that:\n", + "1. Serializes the `CaseFile` to JSON and sends it as the user message\n", + "2. Streams the agent's response via the ADK runner\n", + "3. Extracts the final response and parses it into an `AnalystOutput` object\n", + "\n", + "It implements the `TaskFunction` protocol expected by the Langfuse experiment harness, so it can be passed directly to `run_experiment`. We will use it that way in Notebook 3." + ] + }, + { + "cell_type": "markdown", + "id": "59bbdb311c014d738909a11f9e486628", + "metadata": {}, + "source": [ + "## 6. Running a Single Case\n", + "\n", + "Let's run one case manually and watch the agent work.\n", + "\n", + "> **Note:** This requires a `.env` file with valid `GOOGLE_API_KEY`, `LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, and `LANGFUSE_HOST` values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a65eabff63a45729fe45fb5ade58bdc", + "metadata": {}, + "outputs": [], + "source": [ + "task = AmlInvestigationTask()\n", + "print(f\"Agent : {task._agent.name}\")\n", + "print(f\"Model : {task._agent.model}\")\n", + "print(f\"Tools : {[tool.name for tool in task._agent.tools]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3933fab20d04ec698c2621248eb3be0", + "metadata": {}, + "outputs": [], + "source": [ + "# Pick a case type to run.\n", + "# Try all four types to see how the agent behaves on each.\n", + "CASE_TYPE_TO_RUN = \"True Positive\" # options: \"True Positive\", \"True Negative\", \"False Positive\", \"False Negative\"\n", + "\n", + "idx = summary[summary[\"case_type\"] == CASE_TYPE_TO_RUN].index\n", + "if len(idx) == 0:\n", + " raise ValueError(f\"No cases of type '{CASE_TYPE_TO_RUN}' found.\")\n", + "\n", + "selected_case = cases[idx[0]]\n", + "print(f\"Running case : {selected_case.input.case_id}\")\n", + "print(f\" Type : {CASE_TYPE_TO_RUN}\")\n", + "print(f\" Trigger : {selected_case.input.trigger_label}\")\n", + "print(f\" Window : {selected_case.input.window_start} to {selected_case.input.seed_timestamp}\")\n", + "print()\n", + "print(\"--- Input sent to the agent ---\")\n", + "print(selected_case.input.model_dump_json(indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dd4641cc4064e0191573fe9c69df29b", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the agent. This makes live LLM calls and may take 2-3 minutes.\n", + "agent_output = await task(item={\"input\": selected_case.input.model_dump()})\n", + "\n", + "if agent_output is None:\n", + " print(\"Agent returned no output. Check your credentials and that the database exists.\")\n", + "else:\n", + " print(\"\\n--- Agent Output ---\")\n", + " print(agent_output)\n", + "\n", + " print(\"Agent finished.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8309879909854d7188b41380fd92a7c3", + "metadata": {}, + "source": [ + "## 7. Comparing Agent Output to Ground Truth\n", + "\n", + "Before introducing automated graders, let's compare the output by hand." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ed186c9a28b402fb0bc4494df01f08d", + "metadata": {}, + "outputs": [], + "source": [ + "if agent_output is not None:\n", + " ground_truth = selected_case.expected_output\n", + "\n", + " is_laundering_match = ground_truth.is_laundering == agent_output[\"is_laundering\"]\n", + " pattern_match = ground_truth.pattern_type.value == agent_output[\"pattern_type\"]\n", + "\n", + " ground_truth_transaction_ids = {i.strip() for i in ground_truth.attempt_transaction_ids.split(\",\") if i.strip()}\n", + " agent_flagged_ids = {i.strip() for i in agent_output[\"flagged_transaction_ids\"].split(\",\") if i.strip()}\n", + "\n", + " print(f\"{'Field':<30} {'Ground Truth':<25} {'Agent Output':<25} {'Match?'}\")\n", + " print(\"-\" * 90)\n", + " print(\n", + " f\"{'is_laundering':<30} {str(ground_truth.is_laundering):<25} {str(agent_output['is_laundering']):<25} {'OK' if is_laundering_match else 'WRONG'}\"\n", + " )\n", + " print(\n", + " f\"{'pattern_type':<30} {ground_truth.pattern_type.value:<25} {agent_output['pattern_type'].value:<25} {'OK' if pattern_match else 'WRONG'}\"\n", + " )\n", + " print()\n", + "\n", + " print(f\"Ground truth tx IDs : {ground_truth.attempt_transaction_ids or '(none)'}\")\n", + " print(f\"Agent flagged tx IDs : {agent_output['flagged_transaction_ids'] or '(none)'}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1e1581032b452c9409d6c6813c49d1", + "metadata": {}, + "outputs": [], + "source": [ + "if agent_output is not None:\n", + " print(\"=== Agent Summary Narrative ===\")\n", + " print(agent_output[\"summary_narrative\"])" + ] + }, + { + "cell_type": "markdown", + "id": "379cbbc1e968416e875cc15c1202d7eb", + "metadata": {}, + "source": [ + "## 8. Try the Other Case Types\n", + "\n", + "Go back to the cell in section 6 that sets `CASE_TYPE_TO_RUN` and change it to each of the four types. A few things to pay attention to:\n", + "\n", + "- **False Positive**: The `trigger_label` suggests something suspicious. Does the agent correctly clear the case, or does it follow the trigger?\n", + "- **False Negative**: The `trigger_label` is noise. Does the agent still find the laundering pattern through its own investigation?\n", + "- **True Negative**: A completely benign case. Does the agent close it cleanly without over-reaching?\n", + "\n", + "In the **next** notebook, we will introduce automated graders that quantify the agent's performance across these dimensions at scale." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "277c27b1587741f2af2001be3712ef0d", + "metadata": {}, + "outputs": [], + "source": [ + "await task.close()\n", + "print(\"Task closed.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/aml_investigation/03_evaluation.ipynb b/implementations/aml_investigation/03_evaluation.ipynb new file mode 100644 index 0000000..571f55d --- /dev/null +++ b/implementations/aml_investigation/03_evaluation.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Evaluation Pipeline\n", + "\n", + "In Notebook 2 we built case files and ran one case through the agent by hand. This notebook automates that process across the entire dataset and attaches structured graders at three levels: per-item, per-trace, and per-run.\n", + "\n", + "This notebook covers:\n", + "1. Uploading the case dataset to Langfuse\n", + "2. What each grader measures and why\n", + "3. Running the full experiment\n", + "4. Inspecting results in Python and in the Langfuse UI\n", + "\n", + "---\n", + "\n", + "**Prerequisites:** Complete Notebooks 1 and 2. The case file must exist at `implementations/aml_investigation/data/aml_cases.jsonl`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from aieng.agent_evals.aml_investigation.graders.item import item_level_deterministic_grader\n", + "from aieng.agent_evals.aml_investigation.graders.run import run_level_grader\n", + "from aieng.agent_evals.aml_investigation.graders.trace import trace_deterministic_grader\n", + "from aieng.agent_evals.aml_investigation.task import AmlInvestigationTask\n", + "from aieng.agent_evals.evaluation import run_experiment_with_trace_evals\n", + "from aieng.agent_evals.langfuse import upload_dataset_to_langfuse\n", + "from dotenv import load_dotenv\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "CASES_PATH = Path(\"implementations/aml_investigation/data/aml_cases.jsonl\")\n", + "DATASET_NAME = \"aml-investigation-eval\"\n", + "\n", + "assert CASES_PATH.exists(), f\"Cases file not found at {CASES_PATH}. Run Notebook 2 first.\"\n", + "print(f\"Found {sum(1 for line in CASES_PATH.read_text().splitlines() if line.strip())} cases.\")\n", + "\n", + "load_dotenv(verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Uploading the Dataset to Langfuse\n", + "\n", + "Langfuse acts as the backbone of our evaluation pipeline. Each case file becomes a dataset item in Langfuse: the `input` field (the `CaseFile`) is what gets sent to the agent, and the `expected_output` field (the `GroundTruth`) is stored separately and made available to the graders.\n", + "\n", + "`upload_dataset_to_langfuse` reads the JSONL file, creates the dataset if it does not already exist, and upserts items using a deterministic content-based ID. Running this cell twice is safe: existing items are updated in place rather than duplicated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await upload_dataset_to_langfuse(dataset_path=str(CASES_PATH), dataset_name=DATASET_NAME)\n", + "\n", + "print(f\"Dataset '{DATASET_NAME}' is ready in Langfuse.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. The Graders\n", + "\n", + "We use three layers of graders. Each layer answers a different question about the agent.\n", + "\n", + "### 2.1 Item-level graders\n", + "\n", + "Item-level graders run once per case and compare the agent's `AnalystOutput` to the `GroundTruth`. They are deterministic: no LLM judge is involved.\n", + "\n", + "| Metric | What it measures |\n", + "|---|---|\n", + "| `is_laundering_correct` | Whether the agent's verdict (laundering or not) matches ground truth |\n", + "| `pattern_type_correct` | Whether the agent named the exact correct pattern (e.g. `FAN-OUT`, `NONE`) |\n", + "| `non_laundering_pattern_consistent` | When the agent predicts benign, whether the pattern is `NONE` |\n", + "| `non_laundering_flags_empty` | When the agent predicts benign, whether no transaction IDs are flagged |\n", + "| `id_precision_like` | Of the flagged IDs, how many were correct minus how many were wrong, normalized by count |\n", + "| `id_coverage` | Of the ground truth laundering IDs, what fraction the agent correctly identified |\n", + "\n", + "The two consistency checks (`non_laundering_pattern_consistent` and `non_laundering_flags_empty`) probe a specific failure mode: an agent that correctly says \"not laundering\" but still outputs a suspicious-looking pattern name or a non-empty list of flagged IDs. These outputs would confuse a downstream consumer even if the verdict is right.\n", + "\n", + "### 2.2 Trace-level graders\n", + "\n", + "Trace-level graders inspect the agent's SQL tool calls from the Langfuse trace, not just the final output. They run in a second pass after the experiment finishes, once all traces have been ingested.\n", + "\n", + "| Metric | What it measures |\n", + "|---|---|\n", + "| `trace_has_sql_queries` | Whether the agent issued at least one database query |\n", + "| `trace_read_only_query_check` | Whether all queries were read-only (no `INSERT`, `UPDATE`, `DROP`, etc.) |\n", + "| `trace_window_filter_present` | Whether at least one query referenced the case investigation window |\n", + "| `trace_window_violation_count` | How many queries used timestamps outside the case investigation window |\n", + "| `trace_redundant_query_ratio` | Fraction of queries that were exact duplicates of a previous query in the same run |\n", + "\n", + "These metrics catch issues that would be invisible from the final output alone. An agent could produce a perfect verdict by hallucinating rather than querying, issue write queries, look at data outside the permitted window, or waste its context budget on redundant queries.\n", + "\n", + "### 2.3 Run-level graders\n", + "\n", + "Run-level graders receive all item results after the experiment finishes and compute aggregate classification metrics across the full dataset.\n", + "\n", + "| Metric | What it measures |\n", + "|---|---|\n", + "| `is_laundering_precision` | Precision for laundering detection across all cases |\n", + "| `is_laundering_recall` | Recall for laundering detection across all cases |\n", + "| `is_laundering_f1` | F1 score for laundering detection |\n", + "| `pattern_type_macro_f1` | Macro-averaged F1 across all pattern types |\n", + "| `pattern_type_confusion_matrix` | Full confusion matrix over pattern types (stored in metadata) |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Running the Experiment\n", + "\n", + "The experiment runs in two passes.\n", + "\n", + "Pass 1 executes the task over every dataset item up to `max_concurrency` cases at a time. For each item, the `AmlInvestigationTask` sends the `CaseFile` JSON to the agent, streams its response, and parses it into an `AnalystOutput`. The item-level graders score immediately after each item finishes, while the run-level graders wait until all items are done and then compute aggregate metrics.\n", + "\n", + "Pass 2 waits for all traces to be fully ingested by Langfuse, then runs the trace-level graders. This second pass is necessary because trace data arrives asynchronously: the agent may finish producing output before all intermediate tool-call spans have been written to Langfuse.\n", + "\n", + "> **Note:** This cell makes live LLM calls for every case in the dataset. With 16 cases and `max_concurrency=4`, expect 10 to 15 minutes total depending on model latency." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "task = AmlInvestigationTask()\n", + "\n", + "result = run_experiment_with_trace_evals(\n", + " DATASET_NAME,\n", + " name=\"aml-investigation-baseline\",\n", + " task=task,\n", + " evaluators=[item_level_deterministic_grader],\n", + " trace_evaluators=[trace_deterministic_grader],\n", + " run_evaluators=[run_level_grader],\n", + " description=\"Baseline AML investigation agent evaluation.\",\n", + " max_concurrency=4,\n", + ")\n", + "\n", + "print(\"Experiment complete.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await task.close()\n", + "print(\"Task closed.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Inspecting Results\n", + "\n", + "The `result` object gives programmatic access to every item-level score and to the trace evaluations. Run-level aggregate metrics are computed by the `run_level_grader` and pushed to the Langfuse experiment run; the most convenient place to read them is the Langfuse UI.\n", + "\n", + "### 4.1 Item-level scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rows = []\n", + "for item_result in result.experiment.item_results:\n", + " item = item_result.item\n", + " case_input = item.get(\"input\") if isinstance(item, dict) else item.input\n", + " expected = item.get(\"expected_output\") if isinstance(item, dict) else item.expected_output\n", + "\n", + " row = {\n", + " \"case_id\": case_input.get(\"case_id\", \"\")[:12] + \"...\",\n", + " \"trigger_label\": case_input.get(\"trigger_label\", \"\"),\n", + " \"gt_laundering\": expected.get(\"is_laundering\"),\n", + " \"gt_pattern\": expected.get(\"pattern_type\"),\n", + " }\n", + "\n", + " if item_result.output:\n", + " output = item_result.output\n", + " row[\"pred_laundering\"] = output.get(\"is_laundering\")\n", + " row[\"pred_pattern\"] = output.get(\"pattern_type\").value\n", + " else:\n", + " row[\"pred_laundering\"] = None\n", + " row[\"pred_pattern\"] = None\n", + "\n", + " for evaluation in item_result.evaluations or []:\n", + " row[evaluation.name] = round(evaluation.value, 3) if evaluation.value is not None else None\n", + "\n", + " rows.append(row)\n", + "\n", + "item_df = pd.DataFrame(rows)\n", + "print(item_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mean score per metric across all cases\n", + "score_cols = [\n", + " \"is_laundering_correct\",\n", + " \"pattern_type_correct\",\n", + " \"non_laundering_pattern_consistent\",\n", + " \"non_laundering_flags_empty\",\n", + " \"id_precision_like\",\n", + " \"id_coverage\",\n", + "]\n", + "available = [c for c in score_cols if c in item_df.columns]\n", + "item_df[available].mean().rename(\"mean\").to_frame()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2 Scores by case type\n", + "\n", + "Breaking down `is_laundering_correct` by case type shows where the agent struggles. True positives and true negatives are the easiest; false positives and false negatives are where most agents lose points." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_LOW_SIGNAL = {\"QA_SAMPLE\", \"RANDOM_REVIEW\", \"RETROSPECTIVE_REVIEW\", \"MODEL_MONITORING_SAMPLE\"}\n", + "_HIGH_SIGNAL = {\"ANOMALOUS_BEHAVIOR_ALERT\", \"LAW_ENFORCEMENT_REFERRAL\", \"EXTERNAL_TIP\"}\n", + "_PATTERN_LABELS = {\n", + " \"FAN-IN\",\n", + " \"FAN-OUT\",\n", + " \"CYCLE\",\n", + " \"GATHER-SCATTER\",\n", + " \"SCATTER-GATHER\",\n", + " \"STACK\",\n", + " \"RANDOM\",\n", + " \"BIPARTITE\",\n", + "}\n", + "\n", + "\n", + "def classify_row(row):\n", + " \"\"\"Classify a case based on its trigger label and laundering status.\"\"\"\n", + " label = row[\"trigger_label\"]\n", + " is_laundering = row[\"gt_laundering\"]\n", + " if label in _PATTERN_LABELS and is_laundering:\n", + " return \"True Positive\"\n", + " if label in _LOW_SIGNAL and not is_laundering:\n", + " return \"True Negative\"\n", + " if (label in _HIGH_SIGNAL or label in _PATTERN_LABELS) and not is_laundering:\n", + " return \"False Positive\"\n", + " if label in _LOW_SIGNAL and is_laundering:\n", + " return \"False Negative\"\n", + " return \"Other\"\n", + "\n", + "\n", + "item_df[\"case_type\"] = item_df.apply(classify_row, axis=1)\n", + "\n", + "if \"is_laundering_correct\" in item_df.columns:\n", + " breakdown = (\n", + " item_df.groupby(\"case_type\")[[\"is_laundering_correct\", \"pattern_type_correct\", \"id_coverage\"]].mean().round(3)\n", + " )\n", + " print(breakdown.to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3 Trace-level scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if result.trace_evaluations:\n", + " trace_rows = []\n", + " for trace_id, evaluations in result.trace_evaluations.evaluations_by_trace_id.items():\n", + " row = {\"trace_id\": trace_id[:12] + \"...\"}\n", + " for evaluation in evaluations or []:\n", + " row[evaluation.name] = round(evaluation.value, 3) if evaluation.value is not None else None\n", + " trace_rows.append(row)\n", + "\n", + " trace_df = pd.DataFrame(trace_rows)\n", + " print(trace_df.to_string(index=False))\n", + "else:\n", + " print(\"No trace evaluations available.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mean trace scores across all cases\n", + "trace_score_cols = [\n", + " \"trace_has_sql_queries\",\n", + " \"trace_read_only_query_check\",\n", + " \"trace_window_filter_present\",\n", + " \"trace_window_violation_count\",\n", + " \"trace_redundant_query_ratio\",\n", + "]\n", + "if result.trace_evaluations:\n", + " available_trace = [c for c in trace_score_cols if c in trace_df.columns]\n", + " print(trace_df[available_trace].mean().rename(\"mean\").to_frame())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.4 Run-level aggregate metrics\n", + "\n", + "The run-level grader computes precision, recall, F1 for laundering detection and macro F1 for pattern classification across all cases. These are uploaded to Langfuse and visible in the experiment run summary. You can also extract them from the `ExperimentResult` object returned by the first pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"{'Metric':<40} {'Value'}\")\n", + "print(\"-\" * 50)\n", + "for evaluation in result.experiment.run_evaluations:\n", + " if evaluation.name != \"pattern_type_confusion_matrix\": # shown separately below\n", + " print(f\"{evaluation.name:<40} {evaluation.value:.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pattern confusion matrix\n", + "cm_eval = next((e for e in result.experiment.run_evaluations if e.name == \"pattern_type_confusion_matrix\"), None)\n", + "if cm_eval and cm_eval.metadata:\n", + " labels = cm_eval.metadata.get(\"labels\", [])\n", + " matrix = cm_eval.metadata.get(\"matrix\", [])\n", + " if labels and matrix:\n", + " cm_df = pd.DataFrame(matrix, index=labels, columns=labels)\n", + " cm_df.index.name = \"actual \\\\ predicted\"\n", + " print(cm_df.to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Exploring Traces in the Langfuse UI\n", + "\n", + "The metrics above tell you _what_ the agent got right or wrong, but not _why_. To understand reasoning failures, the Langfuse UI is the right tool. Each experiment item links to the full trace, where you can see every SQL query the agent issued, the model's intermediate reasoning steps, and where in the investigation it went off track.\n", + "\n", + "A few things worth looking at in the UI:\n", + "\n", + "- **False negative cases** where `is_laundering_correct = 0`: did the agent query the database at all? Did it query the right accounts? Did it stop too early?\n", + "- **False positive cases** where `is_laundering_correct = 0`: did the agent over-weight the `trigger_label`? Did it find a real pattern or hallucinate one?\n", + "- **Cases with high `trace_redundant_query_ratio`**: the agent is burning context on repeated queries, which may indicate the prompt strategy for query budgeting is not working.\n", + "- **Cases with `trace_window_violation_count > 0`**: the agent is looking at transactions outside the permitted window, which would be a compliance issue in a real deployment.\n", + "\n", + "## 6. Iterating on the Agent\n", + "\n", + "The evaluation pipeline is designed to make iteration straightforward. The dataset in Langfuse is persistent: once uploaded, you do not need to re-upload it. To evaluate a modified agent, call `run_experiment_with_trace_evals` again with a new `name` argument. Langfuse will create a new experiment run and you can compare runs side by side in the UI.\n", + "\n", + "Common levers to explore:\n", + "\n", + "- **System prompt changes**: edit `ANALYST_PROMPT` in `agent.py` and re-run the experiment. For example, adjust the query strategy section to reduce redundant queries, or revise the typology descriptions to improve pattern classification.\n", + "- **Temperature and sampling**: pass `temperature=0.0` to `create_aml_investigation_agent` for more deterministic outputs, or increase it to probe sensitivity.\n", + "- **Lookback window**: change `lookback_days` when generating cases to make investigation windows wider or narrower and see how the agent adapts.\n", + "- **Case mix**: adjust the ratio of true positives, false positives, and false negatives in `build_cases` to stress-test specific failure modes." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a1f98b86534549a3cb79d72f377296d2ef4e33cc Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Fri, 20 Feb 2026 11:34:11 -0500 Subject: [PATCH 2/2] Update variable names and remove `dtype_backend` for consistency --- .../aml_investigation/02_running_the_agent.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/implementations/aml_investigation/02_running_the_agent.ipynb b/implementations/aml_investigation/02_running_the_agent.ipynb index d47dd6e..101ec37 100644 --- a/implementations/aml_investigation/02_running_the_agent.ipynb +++ b/implementations/aml_investigation/02_running_the_agent.ipynb @@ -215,19 +215,19 @@ "\n", "if not CASES_PATH.exists():\n", " print(\"Downloading dataset files...\")\n", - " path_to_transc_csv = download_dataset_file(ILLICIT_RATIO, TRANSACTIONS_SIZE, \"Trans.csv\")\n", + " path_to_transactions_csv = download_dataset_file(ILLICIT_RATIO, TRANSACTIONS_SIZE, \"Trans.csv\")\n", " path_to_patterns_txt = download_dataset_file(ILLICIT_RATIO, TRANSACTIONS_SIZE, \"Patterns.txt\")\n", " print(\"Download complete.\")\n", "\n", " print(\"Normalizing transactions...\")\n", - " transc_df = pd.read_csv(path_to_transc_csv, dtype_backend=\"pyarrow\")\n", - " transc_df = normalize_transactions_data(transc_df)\n", - " print(f\"Loaded {len(transc_df):,} transactions.\")\n", + " transactions_df = pd.read_csv(path_to_transactions_csv)\n", + " transactions_df = normalize_transactions_data(transactions_df)\n", + " print(f\"Loaded {len(transactions_df):,} transactions.\")\n", "\n", " print(\"Building cases...\")\n", " cases = build_cases(\n", " path_to_patterns_txt,\n", - " transc_df,\n", + " transactions_df,\n", " num_laundering_cases=5,\n", " num_normal_cases=5,\n", " num_false_negative_cases=3,\n",