From d57e914645e8c30b0f5a85a8036d804cda7d339a Mon Sep 17 00:00:00 2001 From: JY Tan Date: Wed, 14 Jan 2026 20:34:51 -0800 Subject: [PATCH 1/4] Commit --- benchmarks/.gitignore | 5 + benchmarks/PROFILING.md | 212 +++++++++ benchmarks/README.md | 161 +++++++ benchmarks/bench/__init__.py | 1 + benchmarks/bench/common.py | 227 ++++++++++ benchmarks/bench/fixed_qps_latency.py | 258 +++++++++++ benchmarks/bench/realistic_workload.py | 300 +++++++++++++ benchmarks/bench/resource_monitor.py | 202 +++++++++ benchmarks/bench/result_utils.py | 337 ++++++++++++++ benchmarks/bench/sdk_active.py | 49 ++ .../bench/sdk_active_with_transforms.py | 71 +++ benchmarks/bench/sdk_disabled.py | 16 + benchmarks/bench/sdk_sampling_rates.py | 108 +++++ benchmarks/compare_benchmarks.py | 275 ++++++++++++ benchmarks/profile/.gitignore | 2 + benchmarks/profile/profile.sh | 135 ++++++ benchmarks/profile/simple_profile.py | 114 +++++ benchmarks/run_benchmarks.sh | 47 ++ benchmarks/server/__init__.py | 1 + benchmarks/server/test_server.py | 420 ++++++++++++++++++ 20 files changed, 2941 insertions(+) create mode 100644 benchmarks/.gitignore create mode 100644 benchmarks/PROFILING.md create mode 100644 benchmarks/README.md create mode 100644 benchmarks/bench/__init__.py create mode 100644 benchmarks/bench/common.py create mode 100644 benchmarks/bench/fixed_qps_latency.py create mode 100644 benchmarks/bench/realistic_workload.py create mode 100644 benchmarks/bench/resource_monitor.py create mode 100644 benchmarks/bench/result_utils.py create mode 100644 benchmarks/bench/sdk_active.py create mode 100644 benchmarks/bench/sdk_active_with_transforms.py create mode 100644 benchmarks/bench/sdk_disabled.py create mode 100644 benchmarks/bench/sdk_sampling_rates.py create mode 100644 benchmarks/compare_benchmarks.py create mode 100644 benchmarks/profile/.gitignore create mode 100755 benchmarks/profile/profile.sh create mode 100644 benchmarks/profile/simple_profile.py create mode 100755 benchmarks/run_benchmarks.sh create mode 100644 benchmarks/server/__init__.py create mode 100644 benchmarks/server/test_server.py diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 0000000..260fbf0 --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,5 @@ +# Benchmark results (regenerated each run) +results/ + +# Trace directories created during benchmarks +.benchmark-traces*/ diff --git a/benchmarks/PROFILING.md b/benchmarks/PROFILING.md new file mode 100644 index 0000000..2e19ea0 --- /dev/null +++ b/benchmarks/PROFILING.md @@ -0,0 +1,212 @@ +# Profiling the Drift Python SDK + +This document explains how to profile the SDK to understand where performance overhead comes from. + +## Quick Start + +```bash +cd /path/to/drift-python-sdk + +# Run cProfile (recommended starting point) +./benchmarks/profile/profile.sh cprofile +``` + +## Available Profilers + +### 1. cProfile (Built-in, Deterministic) + +**Best for:** Understanding call counts and cumulative time per function. + +```bash +./benchmarks/profile/profile.sh cprofile +``` + +This runs the profiling workload and outputs: + +- A `.prof` file in `benchmarks/profile/results/` +- Summary of top functions by cumulative and total time + +**View interactively with snakeviz:** + +```bash +pip install snakeviz +snakeviz benchmarks/profile/results/cprofile_*.prof +``` + +Opens an interactive sunburst diagram in your browser showing the call hierarchy. + +### 2. py-spy (Sampling, Flame Graphs) + +**Best for:** Low-overhead profiling and classic flame graph visualization. 
+ +```bash +# Requires sudo on macOS +sudo ./benchmarks/profile/profile.sh pyspy +``` + +Generates an SVG flame graph that you can open in any browser. + +**Install:** + +```bash +pip install py-spy +# Or on macOS: brew install py-spy +``` + +### 3. Scalene (CPU + Memory) + +**Best for:** Understanding both CPU time and memory allocation per line. + +```bash +pip install scalene +./benchmarks/profile/profile.sh scalene +``` + +Generates an HTML report with line-by-line CPU and memory breakdown. + +### 4. VizTracer (Timeline) + +**Best for:** Seeing the sequence and timing of function calls over time. + +```bash +pip install viztracer +./benchmarks/profile/profile.sh viztracer +``` + +**View the trace:** + +```bash +vizviewer benchmarks/profile/results/viztracer_*.json +``` + +Opens a Chrome DevTools-style timeline visualization. + +## Profile Analysis Results + +Based on profiling 500 HTTP requests through the SDK: + +### Top Overhead Sources + +| Function | Time/Call | Description | +|----------|-----------|-------------| +| `span_serialization.clean_span_to_proto` | ~1.7ms | Converting spans to protobuf format | +| `td_span_processor.on_end` | ~2.1ms | Processing spans when they complete | +| `handler.finalize_wsgi_span` | ~2.2ms | Finalizing HTTP/WSGI spans | +| `otel_converter.otel_span_to_clean_span_data` | ~0.4ms | Converting OpenTelemetry spans | + +### Key Findings + +1. **Span serialization is the biggest bottleneck** + - `_dict_to_struct`, `_value_to_proto`, `_json_schema_to_proto` are called recursively + - Converting rich span data to protobuf is inherently expensive + +2. **The instrumentation itself is cheap** + - Function patching/wrapping adds minimal overhead + - Most time is spent in span processing, not in the hooks + +3. **Sampling reduces overhead proportionally** + - At 10% sampling, most requests skip span serialization entirely + - This explains why lower sampling rates dramatically improve performance + +### Optimization Opportunities + +Based on the profile data: + +1. Lazy serialization - Defer protobuf conversion until export time +2. Batch serialization - Serialize multiple spans together +3. Schema caching - Cache JSON schema conversions +4. Attribute filtering - Skip serializing large/unnecessary attributes + +## Custom Profiling + +### Profile a Specific Workload + +Edit `benchmarks/profile/simple_profile.py` to customize: + +```python +# Adjust number of iterations +iterations = 1000 + +# Change request mix +if i % 3 == 0: + # Your custom endpoint + response = session.get(f"{server_url}/your-endpoint") +``` + +### Profile with Different SDK Settings + +```python +# In simple_profile.py, modify SDK initialization: +sdk = TuskDrift.initialize( + api_key="profile-test-key", + env="profile", + sampling_rate=0.1, # Test with different sampling rates + transforms={...}, # Test with transforms enabled + log_level="warning", +) +``` + +### Profile Production Code + +You can use py-spy to attach to a running process: + +```bash +# Find your Python process PID +ps aux | grep python + +# Attach and record +sudo py-spy record -o profile.svg --pid --duration 30 +``` + +## Comparing Before/After Changes + +1. Run profile before changes: + + ```bash + ./benchmarks/profile/profile.sh cprofile + mv benchmarks/profile/results/cprofile_*.prof benchmarks/profile/results/before.prof + ``` + +2. Make your changes + +3. 
Run profile after changes: + + ```bash + ./benchmarks/profile/profile.sh cprofile + mv benchmarks/profile/results/cprofile_*.prof benchmarks/profile/results/after.prof + ``` + +4. Compare with pstats: + + ```python + import pstats + + before = pstats.Stats('benchmarks/profile/results/before.prof') + after = pstats.Stats('benchmarks/profile/results/after.prof') + + print("=== BEFORE ===") + before.strip_dirs().sort_stats('cumulative').print_stats(20) + + print("=== AFTER ===") + after.strip_dirs().sort_stats('cumulative').print_stats(20) + ``` + +## Output Files + +Profile results are saved to `benchmarks/profile/results/` (gitignored): + +| File | Description | +|------|-------------| +| `cprofile_*.prof` | cProfile binary data | +| `flamegraph_*.svg` | py-spy flame graph | +| `scalene_*.html` | Scalene HTML report | +| `viztracer_*.json` | VizTracer timeline data | +| `traces/` | SDK trace output during profiling | + +## Tips + +- **Start with cProfile** - It's built-in and gives good overview +- **Use snakeviz for exploration** - Interactive visualization helps find hotspots +- **Profile realistic workloads** - Micro-benchmarks may not reflect production patterns +- **Compare sampling rates** - Profile with 100% vs 10% sampling to see the difference +- **Watch for I/O** - File writes and network calls can dominate profiles diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..594982c --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,161 @@ +# Benchmarks + +These benchmarks measure the performance overhead of the Drift Python SDK. + +## Overview + +The benchmark suite runs a Flask test server and makes HTTP requests to various endpoints +while measuring latency, throughput, CPU usage, and memory consumption. Three configurations +are tested: + +1. SDK Disabled (baseline) - No SDK instrumentation +2. SDK Active - SDK in RECORD mode, capturing traces +3. 
SDK Active with Transforms - SDK with data transformation rules enabled + +## Usage + +### Prerequisites + +Make sure you have the required dependencies: + +```bash +cd /path/to/drift-python-sdk +uv pip install psutil flask requests +``` + +### Running All Benchmarks + +```bash +# Run all benchmarks and compare results +./run_benchmarks.sh +``` + +### Running Individual Benchmarks + +```bash +# Realistic workload benchmark (recommended) +python benchmarks/bench/realistic_workload.py + +# Fixed QPS latency test (measures latency at controlled request rates) +python benchmarks/bench/fixed_qps_latency.py + +# Synthetic benchmarks (stress tests) +python benchmarks/bench/sdk_disabled.py +python benchmarks/bench/sdk_active.py +python benchmarks/bench/sdk_active_with_transforms.py + +# Sampling rate comparison +python benchmarks/bench/sdk_sampling_rates.py +``` + +### Comparing Results + +After running benchmarks, compare the results: + +```bash +python benchmarks/compare_benchmarks.py +``` + +### Configuration + +You can configure benchmarks via environment variables: + +- `BENCHMARK_ENABLE_MEMORY=false` - Disable memory monitoring (reduces CPU overhead) + +Or modify the options in `common.py`: + +```python +DEFAULT_OPTIONS = { + "time_per_task_ms": 10_000, # Duration per task (10 seconds default) + "warmup_iterations": 5, # Warmup iterations before measurement + "enable_memory_tracking": True, +} +``` + +## Benchmark Tasks + +### Realistic Workloads (Recommended) + +These endpoints simulate production API behavior: + +- **GET /api/typical-read** (~5-10ms): Auth check + DB read + response serialization +- **POST /api/typical-write** (~15-25ms): Validation + DB write + response +- **POST /api/realistic** (~10-20ms): Validation + DB query + data processing + response + +### Synthetic Workloads + +These are stress tests, not representative of production: + +- **POST /api/compute-hash**: Pure CPU (iterative SHA-256) - useful for profiling +- **POST /api/io-bound**: Pure I/O (sleep delays) - tests baseline overhead +- **POST /api/auth/login, /api/users**: Sensitive data for transform testing + +## Output + +Results are saved to `benchmarks/results/` as JSON files: + +- `sdk-disabled.json` - Baseline results +- `sdk-active.json` - SDK enabled results +- `sdk-active-with-transforms.json` - SDK with transforms results + +The comparison script outputs a markdown table showing: + +- Throughput delta (negative = worse) +- Tail latency (p99) delta (positive = worse) +- CPU usage delta +- Memory overhead + +## Interpreting Results + +- **Throughput $\Delta$**: Percentage change in operations/second. Negative means slower. +- **Tail Latency $\Delta$**: Percentage change in p99 latency. Positive means slower. +- **CPU User $\Delta$**: Change in user-space CPU percentage. +- **Memory $\Delta$**: Additional memory used by the SDK. + +Ideally, the SDK should have minimal impact: + +- Throughput should be within ±5% +- Tail latency increase should be <10% +- Memory overhead should be reasonable (<50MB) + +## Profiling + +For detailed profiling to understand where SDK overhead comes from, see **[PROFILING.md](./PROFILING.md)**. 
+ +Quick start: + +```bash +# Run cProfile analysis +./benchmarks/profile/profile.sh cprofile + +# View interactively +pip install snakeviz +snakeviz benchmarks/profile/results/cprofile_*.prof +``` + +## Architecture + +```text +benchmarks/ +├── bench/ +│ ├── common.py # Shared benchmark logic +│ ├── fixed_qps_latency.py # Fixed QPS latency test +│ ├── realistic_workload.py # Realistic API workload benchmark +│ ├── resource_monitor.py # CPU/memory monitoring +│ ├── result_utils.py # Result serialization +│ ├── sdk_disabled.py # Baseline benchmark (synthetic) +│ ├── sdk_active.py # SDK active benchmark (synthetic) +│ ├── sdk_active_with_transforms.py +│ └── sdk_sampling_rates.py # Sampling rate impact benchmark +├── profile/ +│ ├── profile.sh # Profiler runner script +│ ├── simple_profile.py # Profiling workload +│ └── results/ # Profile output (gitignored) +├── server/ +│ └── test_server.py # Flask test server +├── results/ # JSON output (gitignored) +├── compare_benchmarks.py # Result comparison script +├── run_benchmarks.sh # Runner script +├── PROFILING.md # Profiling documentation +└── README.md +``` diff --git a/benchmarks/bench/__init__.py b/benchmarks/bench/__init__.py new file mode 100644 index 0000000..854b623 --- /dev/null +++ b/benchmarks/bench/__init__.py @@ -0,0 +1 @@ +# Benchmark utilities diff --git a/benchmarks/bench/common.py b/benchmarks/bench/common.py new file mode 100644 index 0000000..d68a156 --- /dev/null +++ b/benchmarks/bench/common.py @@ -0,0 +1,227 @@ +"""Common benchmark runner logic.""" + +from __future__ import annotations + +import os +import sys +import time +from collections.abc import Callable +from typing import Any + +import requests + +# Add parent to path for imports +sys.path.insert(0, str(__file__).rsplit("/", 3)[0]) + +from benchmarks.bench.resource_monitor import ResourceMonitor +from benchmarks.bench.result_utils import ( + create_benchmark_result, + create_task_result, + persist_result, +) +from benchmarks.server.test_server import TestServer + +# Default benchmark options +DEFAULT_OPTIONS = { + "time_per_task_ms": 10_000, # Run each task for 10 seconds + "warmup_iterations": 5, + "enable_memory_tracking": True, +} + + +class BenchmarkTask: + """A single benchmark task.""" + + def __init__( + self, + name: str, + func: Callable[[], Any], + ): + self.name = name + self.func = func + self.samples_ns: list[float] = [] + + +def run_benchmarks( + label: str = "benchmark", + options: dict[str, Any] | None = None, +) -> None: + """Run all benchmarks with the given label.""" + opts = {**DEFAULT_OPTIONS, **(options or {})} + + enable_memory_tracking = os.environ.get("BENCHMARK_ENABLE_MEMORY", "true").lower() != "false" + + # Start test server + server = TestServer() + server_info = server.start() + server_url = server_info["url"] + + # Wait for server to be ready + _wait_for_server(server_url) + + # Initialize resource monitor + resource_monitor = ResourceMonitor( + interval_ms=100, + enable_memory_tracking=enable_memory_tracking, + ) + + # Define benchmark tasks + tasks = _create_tasks(server_url, label) + + print(f"\n{'=' * 60}") + print(f"Running benchmarks: {label}") + print(f"{'=' * 60}\n") + + benchmark_start = time.time() + resource_monitor.start() + + task_results = [] + for task in tasks: + print(f"Running: {task.name}") + resource_monitor.start_task(task.name) + + # Warmup + for _ in range(opts["warmup_iterations"]): + task.func() + + # Actual benchmark + task_start = time.time() + while (time.time() - task_start) * 1000 < 
opts["time_per_task_ms"]: + iter_start = time.perf_counter_ns() + task.func() + iter_end = time.perf_counter_ns() + task.samples_ns.append(iter_end - iter_start) + + resource_monitor.end_task() + + # Create result for this task + task_result = create_task_result(task.name, task.samples_ns, resource_monitor) + task_results.append(task_result) + + # Print summary + if task_result.latency.mean: + mean_ms = task_result.latency.mean / 1_000_000 + print(f" Samples: {task_result.samples}, Mean: {mean_ms:.2f}ms") + if task_result.throughput.mean: + print(f" Throughput: {task_result.throughput.mean:.1f} ops/s") + print() + + resource_monitor.stop() + benchmark_duration_ms = (time.time() - benchmark_start) * 1000 + + # Create and save result + result = create_benchmark_result( + label=label, + tasks=task_results, + duration_ms=benchmark_duration_ms, + options=opts, + ) + + output_path = persist_result(result) + print(f"\n{'=' * 60}") + print("Benchmark complete!") + print(f"Results saved to: {output_path}") + print(f"Total duration: {benchmark_duration_ms / 1000:.1f}s") + print(f"{'=' * 60}\n") + + # Stop server + server.stop() + + +def _wait_for_server(url: str, timeout: float = 10.0) -> None: + """Wait for the server to be ready.""" + start = time.time() + while time.time() - start < timeout: + try: + response = requests.get(f"{url}/health", timeout=1) + if response.status_code == 200: + return + except requests.RequestException: + pass + time.sleep(0.1) + raise RuntimeError(f"Server at {url} did not become ready in {timeout}s") + + +def _create_tasks(server_url: str, test_name: str) -> list[BenchmarkTask]: + """Create benchmark tasks.""" + tasks = [] + + # Use a session for connection pooling to avoid port exhaustion + session = requests.Session() + + # Task 1: High CPU (compute hash) + def cpu_bound(): + response = session.post( + f"{server_url}/api/compute-hash", + json={"data": "sensitive-data-to-hash", "iterations": 1000}, + timeout=30, + ) + response.raise_for_status() + return response.json() + + tasks.append( + BenchmarkTask( + name=f"High CPU: POST /api/compute-hash ({test_name})", + func=cpu_bound, + ) + ) + + # Task 2: High IO, Low CPU + def io_bound(): + response = session.post( + f"{server_url}/api/io-bound", + json={"jobs": 5, "delayMs": 5}, + timeout=30, + ) + response.raise_for_status() + return response.json() + + tasks.append( + BenchmarkTask( + name=f"High IO, Low CPU: POST /api/io-bound ({test_name})", + func=io_bound, + ) + ) + + # Task 3: Transform endpoints (cycling through sensitive data endpoints) + transform_endpoints = [ + { + "path": "/api/auth/login", + "body": {"email": "user@example.com", "password": "super-secret-password-123"}, + }, + { + "path": "/api/users", + "body": { + "username": "testuser", + "email": "test@example.com", + "ssn": "123-45-6789", + "creditCard": "4111-1111-1111-1111", + }, + }, + ] + endpoint_index = [0] # Use list to allow mutation in closure + + def transform_task(): + endpoint = transform_endpoints[endpoint_index[0] % len(transform_endpoints)] + endpoint_index[0] += 1 + response = session.post( + f"{server_url}{endpoint['path']}", + json=endpoint["body"], + timeout=30, + ) + response.raise_for_status() + return response.json() + + tasks.append( + BenchmarkTask( + name=f"Transform endpoints ({test_name})", + func=transform_task, + ) + ) + + return tasks + + +if __name__ == "__main__": + # Quick test run + run_benchmarks(label="test-run", options={"time_per_task_ms": 3000}) diff --git a/benchmarks/bench/fixed_qps_latency.py 
b/benchmarks/bench/fixed_qps_latency.py new file mode 100644 index 0000000..2be9dda --- /dev/null +++ b/benchmarks/bench/fixed_qps_latency.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""Benchmark: Fixed QPS Latency Test. + +This benchmark measures latency at fixed request rates (QPS) to show +how the SDK affects response times under controlled load. + +Unlike throughput tests that blast requests as fast as possible, this +sends requests at a steady rate to measure realistic latency impact. +""" + +import json +import os +import subprocess +import sys +from pathlib import Path + + +def run_fixed_qps_benchmark( + mode: str, target_qps: int, duration_seconds: int = 10, sampling_rate: float = 1.0 +) -> dict | None: + """Run benchmark at a fixed QPS in a subprocess.""" + + script = f''' +import os +import sys +import time +import shutil +import statistics +import threading +from pathlib import Path + +os.environ["TUSK_DRIFT_MODE"] = "{mode}" +sys.path.insert(0, "{os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))}") + +import requests + +TRACE_DIR = Path("{Path(__file__).parent.parent / ".benchmark-traces-qps"}") +if TRACE_DIR.exists(): + shutil.rmtree(TRACE_DIR) + +if "{mode}" == "RECORD": + from drift import TuskDrift + from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + + sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + sampling_rate={sampling_rate}, + log_level="warn", + ) + if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + sdk.mark_app_as_ready() + +from benchmarks.server.test_server import TestServer + +# Start server +server = TestServer() +server_info = server.start() +server_url = server_info["url"] +time.sleep(0.5) + +session = requests.Session() + +# Warmup +for _ in range(20): + session.post(f"{{server_url}}/api/realistic", json={{"userId": "warmup", "query": "test"}}) + +# Fixed QPS parameters +target_qps = {target_qps} +duration_seconds = {duration_seconds} +interval = 1.0 / target_qps # Time between requests + +latencies = [] +start_time = time.perf_counter() +request_count = 0 +target_requests = target_qps * duration_seconds + +while request_count < target_requests: + # Calculate when this request should start + expected_start = start_time + (request_count * interval) + now = time.perf_counter() + + # Wait if we're ahead of schedule + if now < expected_start: + time.sleep(expected_start - now) + + # Make request and measure latency + req_start = time.perf_counter_ns() + response = session.post( + f"{{server_url}}/api/realistic", + json={{"userId": f"user-{{request_count}}", "query": "search query", "email": "test@example.com"}}, + ) + response.json() + latencies.append(time.perf_counter_ns() - req_start) + request_count += 1 + +elapsed = time.perf_counter() - start_time +actual_qps = request_count / elapsed + +# Calculate statistics +sorted_latencies = sorted(latencies) +results = {{ + "target_qps": target_qps, + "actual_qps": round(actual_qps, 1), + "total_requests": request_count, + "duration_seconds": round(elapsed, 2), + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": sorted_latencies[len(sorted_latencies) // 2] / 1_000_000, + "p90_ms": sorted_latencies[int(len(sorted_latencies) * 0.90)] / 1_000_000, + "p99_ms": sorted_latencies[int(len(sorted_latencies) * 0.99)] / 1_000_000, + "min_ms": min(latencies) / 1_000_000, + "max_ms": max(latencies) / 1_000_000, +}} + 
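+# NOTE: latencies are measured client-side around session.post() and response.json(),
+# so they include requests/connection overhead on top of server processing time; the
+# percentile figures above use simple nearest-rank indexing into the sorted samples.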
+session.close() +server.stop() + +if "{mode}" == "RECORD": + sdk.shutdown() + +import json +print("RESULTS_JSON:" + json.dumps(results)) +''' + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + timeout=180, + ) + + if result.returncode != 0: + print(f"Error running benchmark ({mode}, {target_qps} QPS):") + print(result.stderr[-1000:]) + return None + + for line in result.stdout.split("\n"): + if line.startswith("RESULTS_JSON:"): + return json.loads(line.replace("RESULTS_JSON:", "")) + + print("Could not find results in output") + return None + + +def main(): + print("=" * 70) + print("Fixed QPS Latency Benchmark") + print("=" * 70) + print() + print("This test sends requests at a fixed rate to measure latency impact.") + print("Endpoint: POST /api/realistic (~10-15ms baseline)") + print() + + # Test at multiple QPS levels + qps_levels = [25, 50, 75] # Requests per second + duration = 10 # seconds per test + + results = {"baseline": {}, "sdk_100": {}, "sdk_10": {}} + + for qps in qps_levels: + print(f"Testing at {qps} QPS...") + + # Baseline + print(" Running baseline (SDK disabled)...") + baseline = run_fixed_qps_benchmark("DISABLED", qps, duration) + if baseline: + results["baseline"][qps] = baseline + print(f" Mean: {baseline['mean_ms']:.1f}ms, p99: {baseline['p99_ms']:.1f}ms") + + # SDK 100% + print(" Running SDK (100% sampling)...") + sdk_100 = run_fixed_qps_benchmark("RECORD", qps, duration, sampling_rate=1.0) + if sdk_100: + results["sdk_100"][qps] = sdk_100 + print(f" Mean: {sdk_100['mean_ms']:.1f}ms, p99: {sdk_100['p99_ms']:.1f}ms") + + # SDK 10% + print(" Running SDK (10% sampling)...") + sdk_10 = run_fixed_qps_benchmark("RECORD", qps, duration, sampling_rate=0.1) + if sdk_10: + results["sdk_10"][qps] = sdk_10 + print(f" Mean: {sdk_10['mean_ms']:.1f}ms, p99: {sdk_10['p99_ms']:.1f}ms") + + print() + + # Print summary table + print("=" * 70) + print("Results Summary: Latency at Fixed QPS") + print("=" * 70) + print() + + print("### Mean Latency (ms)") + print() + print("| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |") + print("|-----|----------|------------|----------|-----------|----------|") + + for qps in qps_levels: + b = results["baseline"].get(qps, {}) + s100 = results["sdk_100"].get(qps, {}) + s10 = results["sdk_10"].get(qps, {}) + + if b and s100 and s10: + b_mean = b["mean_ms"] + s100_mean = s100["mean_ms"] + s10_mean = s10["mean_ms"] + + overhead_100 = s100_mean - b_mean + overhead_10 = s10_mean - b_mean + pct_100 = (overhead_100 / b_mean) * 100 if b_mean > 0 else 0 + pct_10 = (overhead_10 / b_mean) * 100 if b_mean > 0 else 0 + + print( + f"| {qps} | {b_mean:.1f}ms | {s100_mean:.1f}ms | +{overhead_100:.1f}ms ({pct_100:+.0f}%) | {s10_mean:.1f}ms | +{overhead_10:.1f}ms ({pct_10:+.0f}%) |" + ) + + print() + print("### P99 Latency (ms)") + print() + print("| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |") + print("|-----|----------|------------|----------|-----------|----------|") + + for qps in qps_levels: + b = results["baseline"].get(qps, {}) + s100 = results["sdk_100"].get(qps, {}) + s10 = results["sdk_10"].get(qps, {}) + + if b and s100 and s10: + b_p99 = b["p99_ms"] + s100_p99 = s100["p99_ms"] + s10_p99 = s10["p99_ms"] + + overhead_100 = s100_p99 - b_p99 + overhead_10 = s10_p99 - b_p99 + pct_100 = (overhead_100 / b_p99) * 100 if b_p99 > 0 else 0 + pct_10 = (overhead_10 / b_p99) * 100 if b_p99 > 0 else 0 
+ + print( + f"| {qps} | {b_p99:.1f}ms | {s100_p99:.1f}ms | +{overhead_100:.1f}ms ({pct_100:+.0f}%) | {s10_p99:.1f}ms | +{overhead_10:.1f}ms ({pct_10:+.0f}%) |" + ) + + print() + + # Save results + results_dir = Path(__file__).parent.parent / "results" + results_dir.mkdir(exist_ok=True) + + with open(results_dir / "fixed-qps-latency.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"Results saved to {results_dir / 'fixed-qps-latency.json'}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench/realistic_workload.py b/benchmarks/bench/realistic_workload.py new file mode 100644 index 0000000..77b3c0e --- /dev/null +++ b/benchmarks/bench/realistic_workload.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +"""Benchmark: Realistic workload comparison. + +This benchmark tests SDK overhead on realistic API workloads: +1. Typical read endpoint (~5-10ms baseline) +2. Typical write endpoint (~15-25ms baseline) +3. Realistic mixed workload (~10-20ms baseline) + +These simulate production APIs that do actual work (I/O, validation, processing). +""" + +import json +import os +import subprocess +import sys +from pathlib import Path + + +def run_benchmark_subprocess(mode: str, sampling_rate: float = 1.0) -> dict | None: + """Run benchmark in a subprocess to ensure clean SDK state.""" + + script = f''' +import os +import sys +import time +import shutil +import statistics +from pathlib import Path + +# Configure SDK mode +os.environ["TUSK_DRIFT_MODE"] = "{mode}" +sys.path.insert(0, "{os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))}") + +import requests + +TRACE_DIR = Path("{Path(__file__).parent.parent / ".benchmark-traces-realistic"}") +if TRACE_DIR.exists(): + shutil.rmtree(TRACE_DIR) + +if "{mode}" == "RECORD": + from drift import TuskDrift + from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + + sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + sampling_rate={sampling_rate}, + log_level="warn", + ) + if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + sdk.mark_app_as_ready() + +from benchmarks.server.test_server import TestServer + +# Start server +server = TestServer() +server_info = server.start() +server_url = server_info["url"] + +# Wait for server +time.sleep(0.5) +session = requests.Session() + +# Warmup +for _ in range(10): + session.get(f"{{server_url}}/api/typical-read") + session.post(f"{{server_url}}/api/typical-write", json={{"name": "test"}}) + session.post(f"{{server_url}}/api/realistic", json={{"userId": "u1", "query": "test"}}) + +# Benchmark parameters +iterations = 200 +results = {{}} + +# Test 1: Typical Read (~5-10ms baseline) +latencies = [] +for _ in range(iterations): + start = time.perf_counter_ns() + response = session.get(f"{{server_url}}/api/typical-read") + response.json() + latencies.append(time.perf_counter_ns() - start) + +results["typical_read"] = {{ + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": statistics.median(latencies) / 1_000_000, + "p99_ms": sorted(latencies)[int(len(latencies) * 0.99)] / 1_000_000, + "samples": len(latencies), +}} + +# Test 2: Typical Write (~15-25ms baseline) +latencies = [] +for i in range(iterations): + start = time.perf_counter_ns() + response = session.post(f"{{server_url}}/api/typical-write", json={{"name": f"item-{{i}}"}}) + response.json() + latencies.append(time.perf_counter_ns() - start) + 
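+# NOTE: with iterations=200, each p99 here is a nearest-rank percentile (index 198
+# of the sorted samples), i.e. effectively the second-slowest request, so expect
+# some run-to-run noise in p99 comparisons.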
+results["typical_write"] = {{ + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": statistics.median(latencies) / 1_000_000, + "p99_ms": sorted(latencies)[int(len(latencies) * 0.99)] / 1_000_000, + "samples": len(latencies), +}} + +# Test 3: Realistic Mixed (~10-20ms baseline) +latencies = [] +for i in range(iterations): + start = time.perf_counter_ns() + response = session.post( + f"{{server_url}}/api/realistic", + json={{"userId": f"user-{{i}}", "query": "search query", "email": "test@example.com"}}, + ) + response.json() + latencies.append(time.perf_counter_ns() - start) + +results["realistic_mixed"] = {{ + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": statistics.median(latencies) / 1_000_000, + "p99_ms": sorted(latencies)[int(len(latencies) * 0.99)] / 1_000_000, + "samples": len(latencies), +}} + +# Cleanup +session.close() +server.stop() + +if "{mode}" == "RECORD": + sdk.shutdown() + +import json +print("RESULTS_JSON:" + json.dumps(results)) +''' + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + timeout=120, + ) + + if result.returncode != 0: + print(f"Error running benchmark ({mode}):") + print(result.stderr) + return None + + # Extract results from output + for line in result.stdout.split("\n"): + if line.startswith("RESULTS_JSON:"): + return json.loads(line.replace("RESULTS_JSON:", "")) + + print(f"Could not find results in output for {mode}") + print(result.stdout[-500:]) + return None + + +def format_comparison(baseline: dict, sdk_on: dict) -> dict: + """Calculate overhead percentages.""" + comparison = {} + for key in baseline: + b_mean = baseline[key]["mean_ms"] + s_mean = sdk_on[key]["mean_ms"] + b_p99 = baseline[key]["p99_ms"] + s_p99 = sdk_on[key]["p99_ms"] + + comparison[key] = { + "baseline_mean_ms": b_mean, + "sdk_mean_ms": s_mean, + "mean_overhead_ms": s_mean - b_mean, + "mean_overhead_pct": ((s_mean - b_mean) / b_mean) * 100 if b_mean > 0 else 0, + "baseline_p99_ms": b_p99, + "sdk_p99_ms": s_p99, + "p99_overhead_ms": s_p99 - b_p99, + "p99_overhead_pct": ((s_p99 - b_p99) / b_p99) * 100 if b_p99 > 0 else 0, + } + return comparison + + +def main(): + print("=" * 60) + print("Realistic Workload Benchmark") + print("=" * 60) + print() + + # Run baseline (SDK disabled) + print("Running baseline (SDK disabled)...") + baseline = run_benchmark_subprocess("DISABLED") + if not baseline: + print("Failed to run baseline benchmark") + return + + print( + f" Typical Read: {baseline['typical_read']['mean_ms']:.2f}ms mean, {baseline['typical_read']['p99_ms']:.2f}ms p99" + ) + print( + f" Typical Write: {baseline['typical_write']['mean_ms']:.2f}ms mean, {baseline['typical_write']['p99_ms']:.2f}ms p99" + ) + print( + f" Realistic Mixed: {baseline['realistic_mixed']['mean_ms']:.2f}ms mean, {baseline['realistic_mixed']['p99_ms']:.2f}ms p99" + ) + print() + + # Run with SDK (100% sampling) + print("Running with SDK (100% sampling)...") + sdk_100 = run_benchmark_subprocess("RECORD", sampling_rate=1.0) + if not sdk_100: + print("Failed to run SDK benchmark") + return + + print( + f" Typical Read: {sdk_100['typical_read']['mean_ms']:.2f}ms mean, {sdk_100['typical_read']['p99_ms']:.2f}ms p99" + ) + print( + f" Typical Write: {sdk_100['typical_write']['mean_ms']:.2f}ms mean, {sdk_100['typical_write']['p99_ms']:.2f}ms p99" + ) + print( + f" Realistic Mixed: {sdk_100['realistic_mixed']['mean_ms']:.2f}ms mean, 
{sdk_100['realistic_mixed']['p99_ms']:.2f}ms p99" + ) + print() + + # Run with SDK (10% sampling) + print("Running with SDK (10% sampling)...") + sdk_10 = run_benchmark_subprocess("RECORD", sampling_rate=0.1) + if not sdk_10: + print("Failed to run SDK 10% benchmark") + return + + print( + f" Typical Read: {sdk_10['typical_read']['mean_ms']:.2f}ms mean, {sdk_10['typical_read']['p99_ms']:.2f}ms p99" + ) + print( + f" Typical Write: {sdk_10['typical_write']['mean_ms']:.2f}ms mean, {sdk_10['typical_write']['p99_ms']:.2f}ms p99" + ) + print( + f" Realistic Mixed: {sdk_10['realistic_mixed']['mean_ms']:.2f}ms mean, {sdk_10['realistic_mixed']['p99_ms']:.2f}ms p99" + ) + print() + + # Calculate and display comparisons + comparison_100 = format_comparison(baseline, sdk_100) + comparison_10 = format_comparison(baseline, sdk_10) + + print("=" * 60) + print("Results Summary") + print("=" * 60) + print() + + print("## SDK Overhead on Realistic Workloads") + print() + print("| Endpoint | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |") + print("|----------|----------|------------|----------|-----------|----------|") + + for key, label in [ + ("typical_read", "Read API (~5ms)"), + ("typical_write", "Write API (~15ms)"), + ("realistic_mixed", "Mixed API (~10ms)"), + ]: + c100 = comparison_100[key] + c10 = comparison_10[key] + print( + f"| {label} | {c100['baseline_mean_ms']:.1f}ms | {c100['sdk_mean_ms']:.1f}ms | +{c100['mean_overhead_ms']:.1f}ms ({c100['mean_overhead_pct']:+.1f}%) | {c10['sdk_mean_ms']:.1f}ms | +{c10['mean_overhead_ms']:.1f}ms ({c10['mean_overhead_pct']:+.1f}%) |" + ) + + print() + print("## P99 Latency Impact") + print() + print("| Endpoint | Baseline p99 | SDK (100%) p99 | SDK (10%) p99 |") + print("|----------|--------------|----------------|---------------|") + + for key, label in [("typical_read", "Read API"), ("typical_write", "Write API"), ("realistic_mixed", "Mixed API")]: + c100 = comparison_100[key] + c10 = comparison_10[key] + print( + f"| {label} | {c100['baseline_p99_ms']:.1f}ms | {c100['sdk_p99_ms']:.1f}ms (+{c100['p99_overhead_pct']:.1f}%) | {c10['sdk_p99_ms']:.1f}ms (+{c10['p99_overhead_pct']:.1f}%) |" + ) + + print() + + # Save results + results_dir = Path(__file__).parent.parent / "results" + results_dir.mkdir(exist_ok=True) + + results = { + "baseline": baseline, + "sdk_100_percent": sdk_100, + "sdk_10_percent": sdk_10, + "comparison_100": comparison_100, + "comparison_10": comparison_10, + } + + with open(results_dir / "realistic-workload.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"Results saved to {results_dir / 'realistic-workload.json'}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench/resource_monitor.py b/benchmarks/bench/resource_monitor.py new file mode 100644 index 0000000..9174e16 --- /dev/null +++ b/benchmarks/bench/resource_monitor.py @@ -0,0 +1,202 @@ +"""Resource monitoring for benchmarks - tracks CPU and memory usage.""" + +from __future__ import annotations + +import resource +import threading +import time +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class CpuStats: + """CPU usage statistics.""" + + user_percent: float = 0.0 + system_percent: float = 0.0 + total_percent: float = 0.0 + + +@dataclass +class MemoryStats: + """Memory usage statistics.""" + + rss_avg: float = 0.0 + rss_max: float = 0.0 + + +@dataclass +class TaskResourceStats: + """Resource statistics for a single task.""" + + cpu: CpuStats = field(default_factory=CpuStats) + memory: MemoryStats = 
field(default_factory=MemoryStats) + + +@dataclass +class TaskStats: + """Internal tracking for a task.""" + + rss_sum: float = 0.0 + rss_max: float = 0.0 + start_user_time: float = 0.0 + start_system_time: float = 0.0 + start_time: float = 0.0 + sample_count: int = 0 + + +class ResourceMonitor: + """Monitor CPU and memory usage during benchmark tasks.""" + + def __init__(self, interval_ms: int = 100, enable_memory_tracking: bool = True): + self.interval_ms = interval_ms + self.enable_memory_tracking = enable_memory_tracking + + self._task_stats: dict[str, TaskStats] = {} + self._completed_stats: dict[str, TaskResourceStats] = {} + self._current_task_name: str | None = None + self._current_task_stats: TaskStats | None = None + self._is_running = False + self._monitor_thread: threading.Thread | None = None + self._stop_event = threading.Event() + + def start(self) -> None: + """Start the resource monitor.""" + self._is_running = True + self._stop_event.clear() + + if self.enable_memory_tracking: + self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) + self._monitor_thread.start() + + def stop(self) -> None: + """Stop the resource monitor.""" + self._is_running = False + self._stop_event.set() + if self._monitor_thread: + self._monitor_thread.join(timeout=1) + self._monitor_thread = None + + def start_task(self, task_name: str) -> None: + """Start monitoring a new task.""" + self._current_task_name = task_name + + # Get current CPU times + usage = resource.getrusage(resource.RUSAGE_SELF) + + task_stats = TaskStats( + start_user_time=usage.ru_utime, + start_system_time=usage.ru_stime, + start_time=time.time(), + ) + self._task_stats[task_name] = task_stats + self._current_task_stats = task_stats + + def end_task(self) -> None: + """End monitoring the current task.""" + if not self._current_task_name or not self._current_task_stats: + return + + end_time = time.time() + total_elapsed = end_time - self._current_task_stats.start_time + + # Get final CPU times + usage = resource.getrusage(resource.RUSAGE_SELF) + user_time = usage.ru_utime - self._current_task_stats.start_user_time + system_time = usage.ru_stime - self._current_task_stats.start_system_time + + # Calculate CPU percentages + if total_elapsed > 0: + user_percent = (user_time / total_elapsed) * 100 + system_percent = (system_time / total_elapsed) * 100 + else: + user_percent = 0.0 + system_percent = 0.0 + + total_percent = user_percent + system_percent + + # Calculate memory stats + if self._current_task_stats.sample_count > 0: + rss_avg = self._current_task_stats.rss_sum / self._current_task_stats.sample_count + rss_max = self._current_task_stats.rss_max + else: + rss_avg = 0.0 + rss_max = 0.0 + + resource_stats = TaskResourceStats( + cpu=CpuStats( + user_percent=user_percent, + system_percent=system_percent, + total_percent=total_percent, + ), + memory=MemoryStats( + rss_avg=rss_avg, + rss_max=rss_max, + ), + ) + + self._completed_stats[self._current_task_name] = resource_stats + self._current_task_name = None + self._current_task_stats = None + + def _monitor_loop(self) -> None: + """Background thread for collecting memory samples.""" + while not self._stop_event.wait(self.interval_ms / 1000): + self._collect_memory_sample() + + def _collect_memory_sample(self) -> None: + """Collect a memory usage sample.""" + if not self._is_running or not self._current_task_stats: + return + + # Get RSS from resource module (in bytes on macOS, kilobytes on Linux) + usage = resource.getrusage(resource.RUSAGE_SELF) + # 
ru_maxrss is in bytes on macOS, kilobytes on Linux + import sys + + if sys.platform == "darwin": + rss = usage.ru_maxrss # Already in bytes + else: + rss = usage.ru_maxrss * 1024 # Convert KB to bytes + + # For more accurate current RSS, try to use /proc on Linux + try: + with open("/proc/self/statm") as f: + # statm gives pages, page size is typically 4096 + pages = int(f.read().split()[1]) + rss = pages * 4096 + except (FileNotFoundError, PermissionError): + pass # Use ru_maxrss fallback + + self._current_task_stats.rss_sum += rss + self._current_task_stats.rss_max = max(self._current_task_stats.rss_max, rss) + self._current_task_stats.sample_count += 1 + + def get_task_stats(self, task_name: str) -> TaskResourceStats | None: + """Get resource statistics for a completed task.""" + return self._completed_stats.get(task_name) + + def get_all_task_names(self) -> list[str]: + """Get names of all completed tasks.""" + return list(self._completed_stats.keys()) + + def to_dict(self, task_name: str) -> dict[str, Any] | None: + """Get task stats as a dictionary.""" + stats = self.get_task_stats(task_name) + if not stats: + return None + + return { + "cpu": { + "userPercent": stats.cpu.user_percent, + "systemPercent": stats.cpu.system_percent, + "totalPercent": stats.cpu.total_percent, + }, + "memory": { + "rss": { + "avg": stats.memory.rss_avg, + "max": stats.memory.rss_max, + }, + }, + } diff --git a/benchmarks/bench/result_utils.py b/benchmarks/bench/result_utils.py new file mode 100644 index 0000000..1e4a2b6 --- /dev/null +++ b/benchmarks/bench/result_utils.py @@ -0,0 +1,337 @@ +"""Result utilities for benchmarks - serialization and comparison.""" + +from __future__ import annotations + +import json +import os +import platform +import statistics +import uuid +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from .resource_monitor import ResourceMonitor + +RESULTS_DIR = Path(__file__).parent.parent / "results" + + +@dataclass +class MetricSummary: + """Summary statistics for a metric.""" + + unit: str # "ns" or "ops/s" + mean: float | None = None + median: float | None = None + min: float | None = None + max: float | None = None + std_dev: float | None = None + p99: float | None = None + samples: int = 0 + + +@dataclass +class HistogramBucket: + """A histogram bucket.""" + + min_ns: float + max_ns: float + count: int + + +@dataclass +class TaskBenchmarkResult: + """Benchmark result for a single task.""" + + name: str + samples: int + latency: MetricSummary + throughput: MetricSummary + resource: dict[str, Any] | None = None + histogram: list[HistogramBucket] | None = None + + +@dataclass +class SystemInfo: + """System information.""" + + python_version: str + platform_name: str + arch: str + cpu_count: int + total_memory: int + load_average: list[float] + + +@dataclass +class BenchmarkRunResult: + """Complete benchmark run result.""" + + id: str + label: str + timestamp: str + duration_ms: float + options: dict[str, Any] + system: SystemInfo + tasks: list[TaskBenchmarkResult] = field(default_factory=list) + + +def get_system_info() -> SystemInfo: + """Collect system information.""" + import psutil # ty:ignore[unresolved-import] + + return SystemInfo( + python_version=platform.python_version(), + platform_name=platform.system().lower(), + arch=platform.machine(), + cpu_count=os.cpu_count() or 1, + total_memory=psutil.virtual_memory().total, + load_average=list(os.getloadavg()) if hasattr(os, "getloadavg") else [0.0, 
0.0, 0.0], + ) + + +def build_latency_summary(samples_ns: list[float]) -> MetricSummary: + """Build latency summary from samples (in nanoseconds).""" + if not samples_ns: + return MetricSummary(unit="ns", samples=0) + + sorted_samples = sorted(samples_ns) + n = len(sorted_samples) + + # Calculate p99 + p99_idx = int(n * 0.99) + p99 = sorted_samples[min(p99_idx, n - 1)] + + return MetricSummary( + unit="ns", + mean=statistics.mean(samples_ns), + median=statistics.median(samples_ns), + min=min(samples_ns), + max=max(samples_ns), + std_dev=statistics.stdev(samples_ns) if len(samples_ns) > 1 else 0.0, + p99=p99, + samples=n, + ) + + +def build_throughput_summary(samples_ns: list[float]) -> MetricSummary: + """Build throughput summary (ops/s) from latency samples (ns).""" + if not samples_ns: + return MetricSummary(unit="ops/s", samples=0) + + # Convert ns latencies to ops/s + ops_per_sec = [1_000_000_000 / ns if ns > 0 else 0 for ns in samples_ns] + + return MetricSummary( + unit="ops/s", + mean=statistics.mean(ops_per_sec), + median=statistics.median(ops_per_sec), + min=min(ops_per_sec), + max=max(ops_per_sec), + std_dev=statistics.stdev(ops_per_sec) if len(ops_per_sec) > 1 else 0.0, + samples=len(ops_per_sec), + ) + + +def build_histogram(samples_ns: list[float], bucket_count: int = 20) -> list[HistogramBucket]: + """Build a histogram from latency samples.""" + if not samples_ns: + return [] + + sorted_samples = sorted(samples_ns) + min_val = sorted_samples[0] + max_val = sorted_samples[-1] + + if min_val == max_val: + return [HistogramBucket(min_ns=min_val, max_ns=max_val, count=len(samples_ns))] + + bucket_size = (max_val - min_val) / bucket_count + counts = [0] * bucket_count + + for sample in samples_ns: + idx = int((sample - min_val) / bucket_size) + idx = min(idx, bucket_count - 1) + counts[idx] += 1 + + buckets = [] + for i, count in enumerate(counts): + bucket_min = min_val + i * bucket_size + bucket_max = max_val if i == bucket_count - 1 else min_val + (i + 1) * bucket_size + buckets.append(HistogramBucket(min_ns=bucket_min, max_ns=bucket_max, count=count)) + + return buckets + + +def create_task_result( + name: str, + samples_ns: list[float], + resource_monitor: ResourceMonitor, +) -> TaskBenchmarkResult: + """Create a task benchmark result from raw samples.""" + return TaskBenchmarkResult( + name=name, + samples=len(samples_ns), + latency=build_latency_summary(samples_ns), + throughput=build_throughput_summary(samples_ns), + resource=resource_monitor.to_dict(name), + histogram=build_histogram(samples_ns), + ) + + +def create_benchmark_result( + label: str, + tasks: list[TaskBenchmarkResult], + duration_ms: float, + options: dict[str, Any], +) -> BenchmarkRunResult: + """Create a complete benchmark run result.""" + return BenchmarkRunResult( + id=str(uuid.uuid4()), + label=label, + timestamp=datetime.now(UTC).isoformat(), + duration_ms=duration_ms, + options=options, + system=get_system_info(), + tasks=tasks, + ) + + +def metric_to_dict(metric: MetricSummary) -> dict[str, Any]: + """Convert MetricSummary to dict.""" + return { + "unit": metric.unit, + "mean": metric.mean, + "median": metric.median, + "min": metric.min, + "max": metric.max, + "standardDeviation": metric.std_dev, + "p99": metric.p99, + "samples": metric.samples, + } + + +def task_to_dict(task: TaskBenchmarkResult) -> dict[str, Any]: + """Convert TaskBenchmarkResult to dict.""" + result: dict[str, Any] = { + "name": task.name, + "samples": task.samples, + "latency": metric_to_dict(task.latency), + "throughput": 
metric_to_dict(task.throughput), + } + if task.resource: + result["resource"] = task.resource + if task.histogram: + result["histogram"] = [{"minNs": b.min_ns, "maxNs": b.max_ns, "count": b.count} for b in task.histogram] + return result + + +def result_to_dict(result: BenchmarkRunResult) -> dict[str, Any]: + """Convert BenchmarkRunResult to dict.""" + return { + "id": result.id, + "label": result.label, + "timestamp": result.timestamp, + "durationMs": result.duration_ms, + "options": result.options, + "system": { + "pythonVersion": result.system.python_version, + "platform": result.system.platform_name, + "arch": result.system.arch, + "cpuCount": result.system.cpu_count, + "totalMemory": result.system.total_memory, + "loadAverage": result.system.load_average, + }, + "tasks": [task_to_dict(task) for task in result.tasks], + } + + +def persist_result(result: BenchmarkRunResult) -> Path: + """Save benchmark result to JSON file.""" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + + sanitized_label = "".join(c if c.isalnum() or c in "-_" else "-" for c in result.label) + filename = f"{sanitized_label}.json" + output_path = RESULTS_DIR / filename + + with open(output_path, "w") as f: + json.dump(result_to_dict(result), f, indent=2) + + return output_path + + +def load_result(label: str) -> BenchmarkRunResult | None: + """Load a benchmark result from JSON file.""" + sanitized_label = "".join(c if c.isalnum() or c in "-_" else "-" for c in label) + filename = f"{sanitized_label}.json" + filepath = RESULTS_DIR / filename + + if not filepath.exists(): + return None + + with open(filepath) as f: + data = json.load(f) + + # Reconstruct the result object + system = SystemInfo( + python_version=data["system"]["pythonVersion"], + platform_name=data["system"]["platform"], + arch=data["system"]["arch"], + cpu_count=data["system"]["cpuCount"], + total_memory=data["system"]["totalMemory"], + load_average=data["system"]["loadAverage"], + ) + + tasks = [] + for task_data in data["tasks"]: + latency = MetricSummary( + unit=task_data["latency"]["unit"], + mean=task_data["latency"].get("mean"), + median=task_data["latency"].get("median"), + min=task_data["latency"].get("min"), + max=task_data["latency"].get("max"), + std_dev=task_data["latency"].get("standardDeviation"), + p99=task_data["latency"].get("p99"), + samples=task_data["latency"].get("samples", 0), + ) + throughput = MetricSummary( + unit=task_data["throughput"]["unit"], + mean=task_data["throughput"].get("mean"), + median=task_data["throughput"].get("median"), + min=task_data["throughput"].get("min"), + max=task_data["throughput"].get("max"), + std_dev=task_data["throughput"].get("standardDeviation"), + samples=task_data["throughput"].get("samples", 0), + ) + + histogram = None + if "histogram" in task_data and task_data["histogram"]: + histogram = [ + HistogramBucket( + min_ns=b["minNs"], + max_ns=b["maxNs"], + count=b["count"], + ) + for b in task_data["histogram"] + ] + + tasks.append( + TaskBenchmarkResult( + name=task_data["name"], + samples=task_data["samples"], + latency=latency, + throughput=throughput, + resource=task_data.get("resource"), + histogram=histogram, + ) + ) + + return BenchmarkRunResult( + id=data["id"], + label=data["label"], + timestamp=data["timestamp"], + duration_ms=data["durationMs"], + options=data["options"], + system=system, + tasks=tasks, + ) diff --git a/benchmarks/bench/sdk_active.py b/benchmarks/bench/sdk_active.py new file mode 100644 index 0000000..59ffb2e --- /dev/null +++ b/benchmarks/bench/sdk_active.py @@ 
-0,0 +1,49 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK Active (RECORD mode).""" + +import os +import shutil +import sys +from pathlib import Path + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +# Clean up benchmark traces directory +BENCHMARK_TRACE_DIR = Path(__file__).parent.parent / ".benchmark-traces" +if BENCHMARK_TRACE_DIR.exists(): + try: + shutil.rmtree(BENCHMARK_TRACE_DIR) + except Exception as e: + print(f"Warning: Failed to clean benchmark trace directory: {e}") + +# Initialize SDK BEFORE importing common (which imports requests) +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +# Initialize the SDK +sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + log_level="warn", # Reduce log noise during benchmarks +) + +# Configure filesystem adapter for traces +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=BENCHMARK_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +# Mark app as ready +sdk.mark_app_as_ready() + +from benchmarks.bench.common import run_benchmarks + +if __name__ == "__main__": + try: + run_benchmarks(label="sdk-active") + finally: + sdk.shutdown() diff --git a/benchmarks/bench/sdk_active_with_transforms.py b/benchmarks/bench/sdk_active_with_transforms.py new file mode 100644 index 0000000..dde9428 --- /dev/null +++ b/benchmarks/bench/sdk_active_with_transforms.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK Active with Transforms (RECORD mode + data transformation).""" + +import os +import shutil +import sys +from pathlib import Path + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +# Clean up benchmark traces directory +BENCHMARK_TRACE_DIR = Path(__file__).parent.parent / ".benchmark-traces-transforms" +if BENCHMARK_TRACE_DIR.exists(): + try: + shutil.rmtree(BENCHMARK_TRACE_DIR) + except Exception as e: + print(f"Warning: Failed to clean benchmark trace directory: {e}") + +# Initialize SDK BEFORE importing common (which imports requests) +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +# Define transforms to mask sensitive data +transforms = { + "request": { + "body": [ + {"path": "$.password", "action": "redact"}, + {"path": "$.ssn", "action": "redact"}, + {"path": "$.creditCard", "action": "redact"}, + {"path": "$.email", "action": "hash"}, + ], + }, + "response": { + "body": [ + {"path": "$.token", "action": "redact"}, + {"path": "$.user.email", "action": "hash"}, + {"path": "$.profile.ssn", "action": "redact"}, + {"path": "$.profile.creditCard", "action": "redact"}, + {"path": "$.profile.email", "action": "hash"}, + ], + }, +} + +# Initialize the SDK with transforms +sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + transforms=transforms, + log_level="warn", # Reduce log noise during benchmarks +) + +# Configure filesystem adapter for traces +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=BENCHMARK_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +# Mark app as ready +sdk.mark_app_as_ready() + +from benchmarks.bench.common import run_benchmarks + +if 
__name__ == "__main__": + try: + run_benchmarks(label="sdk-active-with-transforms") + finally: + sdk.shutdown() diff --git a/benchmarks/bench/sdk_disabled.py b/benchmarks/bench/sdk_disabled.py new file mode 100644 index 0000000..845c2b5 --- /dev/null +++ b/benchmarks/bench/sdk_disabled.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK Disabled (baseline).""" + +import os +import sys + +# Ensure SDK is disabled +os.environ["TUSK_DRIFT_MODE"] = "DISABLED" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from benchmarks.bench.common import run_benchmarks + +if __name__ == "__main__": + run_benchmarks(label="sdk-disabled") diff --git a/benchmarks/bench/sdk_sampling_rates.py b/benchmarks/bench/sdk_sampling_rates.py new file mode 100644 index 0000000..929362c --- /dev/null +++ b/benchmarks/bench/sdk_sampling_rates.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK with different sampling rates.""" + +import os +import shutil +import sys +from pathlib import Path + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +# Clean up benchmark traces directory +BENCHMARK_TRACE_DIR = Path(__file__).parent.parent / ".benchmark-traces-sampling" + + +def run_with_sampling_rate(sampling_rate: float) -> dict | None: + """Run benchmark with a specific sampling rate.""" + # Clean traces directory + if BENCHMARK_TRACE_DIR.exists(): + try: + shutil.rmtree(BENCHMARK_TRACE_DIR) + except Exception as e: + print(f"Warning: Failed to clean benchmark trace directory: {e}") + + # We need to reset the SDK for each sampling rate test + # This requires a fresh Python process, so we'll use subprocess + import subprocess + + result = subprocess.run( + [ + sys.executable, + "-c", + f""" +import os +import sys +import shutil +from pathlib import Path + +os.environ["TUSK_DRIFT_MODE"] = "RECORD" +sys.path.insert(0, "{os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))}") + +BENCHMARK_TRACE_DIR = Path("{BENCHMARK_TRACE_DIR}") +if BENCHMARK_TRACE_DIR.exists(): + shutil.rmtree(BENCHMARK_TRACE_DIR) + +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + sampling_rate={sampling_rate}, + log_level="warning", +) + +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=BENCHMARK_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +sdk.mark_app_as_ready() + +from benchmarks.bench.common import run_benchmarks +run_benchmarks(label="sdk-sampling-{int(sampling_rate * 100)}", options={{"time_per_task_ms": 5000, "warmup_iterations": 3}}) + +sdk.shutdown() +""", + ], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + ) + + if result.returncode != 0: + print(f"Error running benchmark with sampling_rate={sampling_rate}:") + print(result.stderr) + return None + + # Print stdout (benchmark output) + print(result.stdout) + + return {"sampling_rate": sampling_rate} + + +def main(): + """Run benchmarks with different sampling rates.""" + sampling_rates = [1.0, 0.5, 0.1, 0.01] + + print("=" * 60) + print("Sampling Rate Impact Benchmarks") + print("=" * 60) + + for rate in sampling_rates: + print(f"\n{'=' * 60}") + 
print(f"Testing sampling_rate = {rate} ({int(rate * 100)}%)") + print("=" * 60) + run_with_sampling_rate(rate) + + print("\n" + "=" * 60) + print("All sampling rate benchmarks complete!") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/compare_benchmarks.py b/benchmarks/compare_benchmarks.py new file mode 100644 index 0000000..b2d5489 --- /dev/null +++ b/benchmarks/compare_benchmarks.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +"""Compare benchmark results and generate summary.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from benchmarks.bench.result_utils import ( + BenchmarkRunResult, + TaskBenchmarkResult, + load_result, +) + +# Task base names (without the label suffix) +CPU_HEAVY_TASK_BASE = "High CPU: POST /api/compute-hash" +IO_HEAVY_TASK_BASE = "High IO, Low CPU: POST /api/io-bound" +TRANSFORM_TASK_BASE = "Transform endpoints" + + +def find_task_by_base_name(tasks: list[TaskBenchmarkResult], base_name: str) -> TaskBenchmarkResult | None: + """Find a task by its base name (ignoring the label suffix).""" + for task in tasks: + if task.name.startswith(base_name + " ("): + return task + return None + + +def percentage_change(baseline: float | None, variant: float | None) -> float | None: + """Calculate percentage change from baseline to variant.""" + if baseline is None or variant is None or baseline == 0: + return None + return ((variant - baseline) / baseline) * 100 + + +def format_percentage(value: float | None) -> str: + """Format a percentage value.""" + if value is None: + return "n/a" + sign = "+" if value >= 0 else "" + return f"{sign}{value:.1f}%" + + +def format_latency(ns: float | None) -> str: + """Format latency in appropriate units.""" + if ns is None: + return "n/a" + if ns < 1_000: + return f"{ns:.0f} ns" + if ns < 1_000_000: + return f"{ns / 1_000:.2f} μs" + if ns < 1_000_000_000: + return f"{ns / 1_000_000:.2f} ms" + return f"{ns / 1_000_000_000:.2f} s" + + +def format_throughput(value: float | None) -> str: + """Format throughput in ops/s.""" + if value is None: + return "n/a" + return f"{value:,.0f}" + + +def format_bytes(bytes_val: float | None) -> str: + """Format bytes in MB.""" + if bytes_val is None: + return "n/a" + mb = bytes_val / (1024 * 1024) + return f"{mb:.2f} MB" + + +def format_cpu_percent(value: float | None) -> str: + """Format CPU percentage.""" + if value is None: + return "n/a" + return f"{value:.1f}%" + + +def compute_impact( + label: str, + baseline: TaskBenchmarkResult | None, + variant: TaskBenchmarkResult | None, +) -> dict | None: + """Compute impact metrics between baseline and variant.""" + if not baseline or not variant: + return None + + baseline_throughput = baseline.throughput.mean + variant_throughput = variant.throughput.mean + baseline_p99 = baseline.latency.p99 + variant_p99 = variant.latency.p99 + + # CPU stats + baseline_cpu_user = baseline.resource.get("cpu", {}).get("userPercent") if baseline.resource else None + variant_cpu_user = variant.resource.get("cpu", {}).get("userPercent") if variant.resource else None + baseline_cpu_total = baseline.resource.get("cpu", {}).get("totalPercent") if baseline.resource else None + variant_cpu_total = variant.resource.get("cpu", {}).get("totalPercent") if variant.resource else None + + return { + "label": label, + "throughput_delta_pct": percentage_change(baseline_throughput, variant_throughput), + "tail_latency_delta_pct": percentage_change(baseline_p99, 
variant_p99), + "baseline_throughput": baseline_throughput, + "variant_throughput": variant_throughput, + "baseline_p99": baseline_p99, + "variant_p99": variant_p99, + "cpu_user_delta_pct": percentage_change(baseline_cpu_user, variant_cpu_user), + "baseline_cpu_total": baseline_cpu_total, + "variant_cpu_total": variant_cpu_total, + } + + +def compute_memory_overhead( + baseline: BenchmarkRunResult, + variant: BenchmarkRunResult, +) -> dict: + """Compute memory overhead between baseline and variant.""" + rss_deltas = [] + rss_max_delta = None + + for baseline_task in baseline.tasks: + # Find matching task in variant + base_name = baseline_task.name.rsplit(" (", 1)[0] + variant_task = find_task_by_base_name(variant.tasks, base_name) + + if not variant_task: + continue + + baseline_mem = baseline_task.resource.get("memory", {}) if baseline_task.resource else {} + variant_mem = variant_task.resource.get("memory", {}) if variant_task.resource else {} + + baseline_rss = baseline_mem.get("rss", {}) + variant_rss = variant_mem.get("rss", {}) + + if baseline_rss.get("avg") is not None and variant_rss.get("avg") is not None: + delta = variant_rss["avg"] - baseline_rss["avg"] + rss_deltas.append(delta) + + if baseline_rss.get("max") is not None and variant_rss.get("max") is not None: + delta = variant_rss["max"] - baseline_rss["max"] + if rss_max_delta is None or delta > rss_max_delta: + rss_max_delta = delta + + return { + "avg_rss_delta": sum(rss_deltas) / len(rss_deltas) if rss_deltas else None, + "max_rss_delta": rss_max_delta, + "samples": len(rss_deltas), + } + + +def main(): + """Main comparison function.""" + print("\nLoading benchmark results...") + + baseline = load_result("sdk-disabled") + active = load_result("sdk-active") + transforms = load_result("sdk-active-with-transforms") + + if not baseline: + print("ERROR: sdk-disabled results not found. Run benchmarks first.") + return + if not active: + print("ERROR: sdk-active results not found. 
Run benchmarks first.") + return + + print(f"\nBaseline: {baseline.label} ({baseline.timestamp})") + print(f"Active: {active.label} ({active.timestamp})") + if transforms: + print(f"Transforms: {transforms.label} ({transforms.timestamp})") + + # Compute impacts + cpu_impact = compute_impact( + CPU_HEAVY_TASK_BASE, + find_task_by_base_name(baseline.tasks, CPU_HEAVY_TASK_BASE), + find_task_by_base_name(active.tasks, CPU_HEAVY_TASK_BASE), + ) + + io_impact = compute_impact( + IO_HEAVY_TASK_BASE, + find_task_by_base_name(baseline.tasks, IO_HEAVY_TASK_BASE), + find_task_by_base_name(active.tasks, IO_HEAVY_TASK_BASE), + ) + + transform_impact = None + if transforms: + transform_impact = compute_impact( + TRANSFORM_TASK_BASE, + find_task_by_base_name(active.tasks, TRANSFORM_TASK_BASE), + find_task_by_base_name(transforms.tasks, TRANSFORM_TASK_BASE), + ) + + # Print performance impact table + print("\n## Benchmark Impact Summary\n") + print("| Workload | Throughput Δ | Tail Latency (p99) Δ | CPU User Δ |") + print("|----------|-------------|----------------------|------------|") + + if cpu_impact: + print( + f"| **CPU-bound** | {format_percentage(cpu_impact['throughput_delta_pct'])} | " + f"{format_percentage(cpu_impact['tail_latency_delta_pct'])} | " + f"{format_percentage(cpu_impact['cpu_user_delta_pct'])} |" + ) + else: + print("| **CPU-bound** | N/A | N/A | N/A |") + + if io_impact: + print( + f"| **IO-bound** | {format_percentage(io_impact['throughput_delta_pct'])} | " + f"{format_percentage(io_impact['tail_latency_delta_pct'])} | " + f"{format_percentage(io_impact['cpu_user_delta_pct'])} |" + ) + else: + print("| **IO-bound** | N/A | N/A | N/A |") + + if transform_impact: + print( + f"| **Transform endpoints** | {format_percentage(transform_impact['throughput_delta_pct'])} | " + f"{format_percentage(transform_impact['tail_latency_delta_pct'])} | " + f"{format_percentage(transform_impact['cpu_user_delta_pct'])} |" + ) + else: + print("| **Transform endpoints** | N/A | N/A | N/A |") + + # Print absolute values table + print("\n## Absolute Values\n") + print("| Workload | Baseline Throughput | Active Throughput | Baseline p99 | Active p99 |") + print("|----------|--------------------|--------------------|--------------|------------|") + + if cpu_impact: + print( + f"| **CPU-bound** | {format_throughput(cpu_impact['baseline_throughput'])} ops/s | " + f"{format_throughput(cpu_impact['variant_throughput'])} ops/s | " + f"{format_latency(cpu_impact['baseline_p99'])} | " + f"{format_latency(cpu_impact['variant_p99'])} |" + ) + + if io_impact: + print( + f"| **IO-bound** | {format_throughput(io_impact['baseline_throughput'])} ops/s | " + f"{format_throughput(io_impact['variant_throughput'])} ops/s | " + f"{format_latency(io_impact['baseline_p99'])} | " + f"{format_latency(io_impact['variant_p99'])} |" + ) + + # Memory overhead + print("\n## Memory Overhead vs Baseline\n") + print("| Configuration | Avg RSS Δ | Max RSS Δ |") + print("|---------------|-----------|-----------|") + + active_memory = compute_memory_overhead(baseline, active) + if active_memory["samples"] > 0: + print( + f"| **SDK Active** | {format_bytes(active_memory['avg_rss_delta'])} | " + f"{format_bytes(active_memory['max_rss_delta'])} |" + ) + else: + print("| **SDK Active** | N/A | N/A |") + + if transforms: + transform_memory = compute_memory_overhead(baseline, transforms) + if transform_memory["samples"] > 0: + print( + f"| **SDK Active w/ Transforms** | {format_bytes(transform_memory['avg_rss_delta'])} | " + 
f"{format_bytes(transform_memory['max_rss_delta'])} |" + ) + else: + print("| **SDK Active w/ Transforms** | N/A | N/A |") + + print("\n✓ Summary complete.\n") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/profile/.gitignore b/benchmarks/profile/.gitignore new file mode 100644 index 0000000..fcb8cf5 --- /dev/null +++ b/benchmarks/profile/.gitignore @@ -0,0 +1,2 @@ +# Profile results +results/ diff --git a/benchmarks/profile/profile.sh b/benchmarks/profile/profile.sh new file mode 100755 index 0000000..6583ac9 --- /dev/null +++ b/benchmarks/profile/profile.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Profile the SDK using different profilers + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")" +RESULTS_DIR="$SCRIPT_DIR/results" + +cd "$PROJECT_ROOT" + +mkdir -p "$RESULTS_DIR" + +PROFILER="${1:-cprofile}" # Default to cProfile + +case "$PROFILER" in + "pyspy"|"py-spy"|"flamegraph") + echo "==============================================" + echo "Running py-spy flame graph profiler" + echo "==============================================" + + if ! command -v py-spy &> /dev/null; then + echo "py-spy not found. Install with: pip install py-spy" + echo "Or on macOS: brew install py-spy" + exit 1 + fi + + OUTPUT="$RESULTS_DIR/flamegraph_$(date +%Y%m%d_%H%M%S).svg" + echo "Output: $OUTPUT" + echo "" + + # py-spy needs sudo on macOS for sampling + if [[ "$OSTYPE" == "darwin"* ]]; then + echo "Note: py-spy may require sudo on macOS" + sudo py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + else + py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + fi + + echo "" + echo "Flame graph saved to: $OUTPUT" + echo "Open in browser to view." + ;; + + "cprofile"|"cProfile") + echo "==============================================" + echo "Running cProfile deterministic profiler" + echo "==============================================" + + OUTPUT="$RESULTS_DIR/cprofile_$(date +%Y%m%d_%H%M%S).prof" + echo "Output: $OUTPUT" + echo "" + + python -m cProfile -o "$OUTPUT" benchmarks/profile/simple_profile.py + + echo "" + echo "Profile saved to: $OUTPUT" + echo "" + echo "Analyzing top functions by cumulative time..." + echo "" + + python -c " +import pstats +p = pstats.Stats('$OUTPUT') +p.strip_dirs() +p.sort_stats('cumulative') +print('=== Top 30 by cumulative time ===') +p.print_stats(30) +print() +print('=== Top 30 by total time (self) ===') +p.sort_stats('tottime') +p.print_stats(30) +" + ;; + + "scalene") + echo "==============================================" + echo "Running Scalene CPU/memory profiler" + echo "==============================================" + + if ! command -v scalene &> /dev/null; then + echo "scalene not found. Install with: pip install scalene" + exit 1 + fi + + OUTPUT="$RESULTS_DIR/scalene_$(date +%Y%m%d_%H%M%S).html" + echo "Output: $OUTPUT" + echo "" + + scalene --html --outfile "$OUTPUT" benchmarks/profile/simple_profile.py + + echo "" + echo "Scalene report saved to: $OUTPUT" + ;; + + "viztracer") + echo "==============================================" + echo "Running VizTracer timeline profiler" + echo "==============================================" + + if ! python -c "import viztracer" 2>/dev/null; then + echo "viztracer not found. 
Install with: pip install viztracer" + exit 1 + fi + + OUTPUT="$RESULTS_DIR/viztracer_$(date +%Y%m%d_%H%M%S).json" + echo "Output: $OUTPUT" + echo "" + + python -m viztracer -o "$OUTPUT" benchmarks/profile/simple_profile.py + + echo "" + echo "VizTracer trace saved to: $OUTPUT" + echo "View with: vizviewer $OUTPUT" + ;; + + *) + echo "Usage: $0 [profiler]" + echo "" + echo "Available profilers:" + echo " cprofile - Built-in deterministic profiler (default)" + echo " pyspy - Sampling profiler with flame graphs" + echo " scalene - CPU/memory profiler with line-level detail" + echo " viztracer - Timeline/trace profiler" + echo "" + echo "Examples:" + echo " $0 cprofile # Run cProfile" + echo " $0 pyspy # Generate flame graph" + echo " $0 scalene # CPU/memory analysis" + exit 1 + ;; +esac + +echo "" +echo "Done!" diff --git a/benchmarks/profile/simple_profile.py b/benchmarks/profile/simple_profile.py new file mode 100644 index 0000000..22bfb1e --- /dev/null +++ b/benchmarks/profile/simple_profile.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Simple profiling script for the SDK. + +This script can be used with various profilers: + +1. py-spy (flame graphs): + py-spy record -o profile.svg -- python benchmarks/profile/simple_profile.py + +2. cProfile (call stats): + python -m cProfile -o profile.prof benchmarks/profile/simple_profile.py + # Then view with: python -m pstats profile.prof + +3. scalene (CPU/memory): + scalene benchmarks/profile/simple_profile.py +""" + +import os +import sys +import time + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from pathlib import Path + +import requests + +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +# Setup traces directory +PROFILE_TRACE_DIR = Path(__file__).parent / "results" / "traces" +PROFILE_TRACE_DIR.mkdir(parents=True, exist_ok=True) + +# Initialize SDK +sdk = TuskDrift.initialize( + api_key="profile-test-key", + env="profile", + log_level="warn", +) + +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=PROFILE_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +sdk.mark_app_as_ready() + +# Import server after SDK init +from benchmarks.server.test_server import TestServer + + +def main(): + """Run profiling workload.""" + # Start test server + server = TestServer() + server_info = server.start() + server_url = server_info["url"] + + # Wait for server + time.sleep(0.5) + + # Use session for connection pooling + session = requests.Session() + + print(f"\nTest server started at {server_url}") + print("Running profiling workload...") + + # Number of iterations (adjust for longer/shorter profiling) + iterations = 500 + + start_time = time.time() + + for i in range(iterations): + # Mix of different request types + if i % 3 == 0: + # Simple GET + response = session.get(f"{server_url}/api/simple") + response.json() + elif i % 3 == 1: + # POST with JSON body + response = session.post( + f"{server_url}/api/echo", + json={"data": "test", "iteration": i}, + ) + response.json() + else: + # Sensitive data endpoint (triggers transform checks) + response = session.post( + f"{server_url}/api/auth/login", + json={"email": "test@example.com", "password": "secret123"}, + ) + response.json() + + if (i + 1) % 100 == 0: + elapsed = time.time() - start_time + rate = (i + 1) / elapsed + print(f" {i + 
1}/{iterations} iterations ({rate:.1f} req/s)") + + elapsed = time.time() - start_time + print(f"\nCompleted {iterations} iterations in {elapsed:.2f}s") + print(f"Average: {iterations / elapsed:.1f} req/s") + + # Cleanup + session.close() + server.stop() + sdk.shutdown() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run_benchmarks.sh b/benchmarks/run_benchmarks.sh new file mode 100755 index 0000000..6367d74 --- /dev/null +++ b/benchmarks/run_benchmarks.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Run all benchmarks and compare results + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +cd "$PROJECT_ROOT" + +echo "==============================================" +echo "Drift Python SDK Benchmarks" +echo "==============================================" +echo "" + +# Check for psutil +if ! python -c "import psutil" 2>/dev/null; then + echo "Installing psutil..." + uv pip install psutil +fi + +# Check for flask +if ! python -c "import flask" 2>/dev/null; then + echo "Installing flask..." + uv pip install flask +fi + +echo "" +echo "Running SDK Disabled (baseline)..." +echo "----------------------------------------------" +python benchmarks/bench/sdk_disabled.py + +echo "" +echo "Running SDK Active..." +echo "----------------------------------------------" +python benchmarks/bench/sdk_active.py + +echo "" +echo "Running SDK Active with Transforms..." +echo "----------------------------------------------" +python benchmarks/bench/sdk_active_with_transforms.py + +echo "" +echo "==============================================" +echo "Comparing Results" +echo "==============================================" +python benchmarks/compare_benchmarks.py diff --git a/benchmarks/server/__init__.py b/benchmarks/server/__init__.py new file mode 100644 index 0000000..087929f --- /dev/null +++ b/benchmarks/server/__init__.py @@ -0,0 +1 @@ +# Test server for benchmarks diff --git a/benchmarks/server/test_server.py b/benchmarks/server/test_server.py new file mode 100644 index 0000000..5cdddb3 --- /dev/null +++ b/benchmarks/server/test_server.py @@ -0,0 +1,420 @@ +"""Test server for benchmarks - provides various endpoint types for measuring SDK overhead.""" + +from __future__ import annotations + +import hashlib +import json +import threading +from typing import Any + +from flask import Flask, Response, request + + +class TestServer: + """Flask-based test server for benchmarks.""" + + def __init__(self, host: str = "127.0.0.1", port: int = 0): + self.host = host + self.port = port + self.app = Flask(__name__) + self._server_thread: threading.Thread | None = None + self._actual_port: int | None = None + self._setup_routes() + + def _setup_routes(self) -> None: + """Set up all benchmark endpoints.""" + + @self.app.route("/health") + def health() -> Response: + return Response( + json.dumps({"status": "ok"}), + mimetype="application/json", + ) + + @self.app.route("/api/simple") + def simple() -> Response: + """Minimal processing endpoint.""" + import time + + return Response( + json.dumps({"message": "Hello World", "timestamp": int(time.time() * 1000)}), + mimetype="application/json", + ) + + @self.app.route("/api/simple-post", methods=["POST"]) + def simple_post() -> Response: + """Minimal POST endpoint.""" + import time + + return Response( + json.dumps({"message": "Hello World", "timestamp": int(time.time() * 1000)}), + mimetype="application/json", + ) + + @self.app.route("/api/echo", methods=["POST"]) + def echo() -> Response: + """Echo back 
request body.""" + data = request.get_json(force=True, silent=True) or {} + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/small") + def small() -> Response: + """Return ~100KB payload.""" + import time + + data = { + "id": "small-123", + "data": "x" * (100 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/small-post", methods=["POST"]) + def small_post() -> Response: + """Accept and return ~100KB payload.""" + import time + + data = { + "id": "small-post-123", + "data": "x" * (100 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/medium") + def medium() -> Response: + """Return ~1MB payload.""" + import time + + data = { + "id": "medium-456", + "data": "x" * (1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/medium-post", methods=["POST"]) + def medium_post() -> Response: + """Accept and return ~1MB payload.""" + import time + + data = { + "id": "medium-post-456", + "data": "x" * (1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/large") + def large() -> Response: + """Return ~2MB payload.""" + import time + + data = { + "id": "large-789", + "data": "x" * (2 * 1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/large-post", methods=["POST"]) + def large_post() -> Response: + """Accept and return ~2MB payload.""" + import time + + data = { + "id": "large-post-789", + "data": "x" * (2 * 1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/compute-hash", methods=["POST"]) + def compute_hash() -> Response: + """CPU-intensive endpoint - iterative hashing.""" + data = request.get_json(force=True, silent=True) or {} + input_data = data.get("data", "default-data") + iterations = data.get("iterations", 1000) + + hash_val = input_data + for _ in range(iterations): + hash_val = hashlib.sha256(hash_val.encode()).hexdigest() + + return Response( + json.dumps({"hash": hash_val, "iterations": iterations}), + mimetype="application/json", + ) + + @self.app.route("/api/compute-json", methods=["POST"]) + def compute_json() -> Response: + """CPU-intensive endpoint - JSON parsing/stringifying.""" + req_data = request.get_json(force=True, silent=True) or {} + iterations = req_data.get("iterations", 100) + data = req_data.get("data", {"test": "data", "nested": {"value": 123}}) + + result = data + for i in range(iterations): + str_data = json.dumps(result) + result = json.loads(str_data) + result["iteration"] = i + + return Response(json.dumps(result), mimetype="application/json") + + @self.app.route("/api/auth/login", methods=["POST"]) + def auth_login() -> Response: + """Endpoint with sensitive data for transform testing.""" + data = request.get_json(force=True, silent=True) or {} + return Response( + json.dumps( + { + "success": True, + "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", + "user": { + "id": 123, + "email": data.get("email", "user@example.com"), + "role": "user", + }, + } + ), + mimetype="application/json", + ) + + @self.app.route("/api/users", methods=["POST"]) + def create_user() -> 
Response: + """Endpoint with nested sensitive data.""" + import time + + data = request.get_json(force=True, silent=True) or {} + return Response( + json.dumps( + { + "id": int(time.time() * 1000), + "username": data.get("username", "testuser"), + "profile": { + "email": data.get("email", "test@example.com"), + "ssn": data.get("ssn", "123-45-6789"), + "creditCard": data.get("creditCard", "4111-1111-1111-1111"), + }, + "createdAt": "2025-01-14T00:00:00Z", + } + ), + mimetype="application/json", + ) + + @self.app.route("/api/io-bound", methods=["POST"]) + def io_bound() -> Response: + """High IO, low CPU endpoint - simulates IO-bound work.""" + import time + + data = request.get_json(force=True, silent=True) or {} + jobs = data.get("jobs", 5) + delay_ms = data.get("delayMs", 5) + + results = [] + for i in range(jobs): + time.sleep(delay_ms / 1000) + results.append( + { + "jobId": i, + "timestamp": int(time.time() * 1000), + "status": "completed", + } + ) + + return Response( + json.dumps( + { + "totalJobs": jobs, + "results": results, + "completedAt": int(time.time() * 1000), + } + ), + mimetype="application/json", + ) + + @self.app.route("/api/slow") + def slow() -> Response: + """Slow endpoint - simulates slow processing.""" + import time + + time.sleep(0.1) + return Response( + json.dumps({"message": "Slow response", "timestamp": int(time.time() * 1000)}), + mimetype="application/json", + ) + + @self.app.route("/api/error") + def error() -> Response: + """Error endpoint.""" + return Response( + json.dumps({"error": "Internal Server Error"}), + status=500, + mimetype="application/json", + ) + + @self.app.route("/api/realistic", methods=["POST"]) + def realistic() -> Response: + """Realistic API endpoint - simulates typical production workload. + + This endpoint simulates a real API that: + 1. Validates input (light CPU) + 2. Queries a database (I/O wait - simulated with sleep) + 3. Processes results (moderate CPU) + 4. Returns response + + Target: ~10-20ms response time (typical for production APIs) + """ + import re + import time + + data = request.get_json(force=True, silent=True) or {} + + # 1. Input validation (light CPU work) + user_id = data.get("userId", "user-123") + query = data.get("query", "default query") + + # Validate email format if provided + email = data.get("email", "") + email_valid = bool(re.match(r"[^@]+@[^@]+\.[^@]+", email)) if email else True + + # 2. Simulate database query (I/O wait ~10ms) + time.sleep(0.010) + + # 3. Process results (moderate CPU - JSON manipulation) + results = [] + for i in range(10): + item = { + "id": f"item-{i}", + "userId": user_id, + "data": {"index": i, "query": query[:50]}, + "score": (i + 1) * 0.1, + } + # Serialize and deserialize (simulates data transformation) + item_str = json.dumps(item) + results.append(json.loads(item_str)) + + # 4. Build response + response_data = { + "success": True, + "userId": user_id, + "emailValid": email_valid, + "resultCount": len(results), + "results": results, + "processingTimeMs": 10, # Approximate + "timestamp": int(time.time() * 1000), + } + + return Response(json.dumps(response_data), mimetype="application/json") + + @self.app.route("/api/typical-read", methods=["GET"]) + def typical_read() -> Response: + """Typical read endpoint - simulates fetching data. 
+ + Simulates: Auth check + DB read + response serialization + Target: ~5-10ms response time + """ + import time + + # Simulate auth token validation (light CPU) + auth_header = request.headers.get("Authorization", "") + is_authenticated = len(auth_header) > 10 + + # Simulate database read (~5ms) + time.sleep(0.005) + + # Build response + data = { + "id": "resource-123", + "name": "Example Resource", + "description": "This is a typical API response", + "metadata": { + "createdAt": "2025-01-14T00:00:00Z", + "updatedAt": "2025-01-14T12:00:00Z", + "version": 1, + }, + "authenticated": is_authenticated, + "timestamp": int(time.time() * 1000), + } + + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/typical-write", methods=["POST"]) + def typical_write() -> Response: + """Typical write endpoint - simulates creating/updating data. + + Simulates: Validation + DB write + response + Target: ~15-25ms response time + """ + import time + + data = request.get_json(force=True, silent=True) or {} + + # Input validation (light CPU) + name = data.get("name", "") + if len(name) < 1: + name = "default" + + # Simulate database write (~15ms - writes are slower than reads) + time.sleep(0.015) + + # Build response + response_data = { + "id": f"new-{int(time.time() * 1000)}", + "name": name, + "status": "created", + "timestamp": int(time.time() * 1000), + } + + return Response(json.dumps(response_data), mimetype="application/json") + + def start(self) -> dict[str, Any]: + """Start the test server in a background thread.""" + import socket + + from werkzeug.serving import make_server + + # Find a free port if port is 0 + if self.port == 0: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((self.host, 0)) + self._actual_port = s.getsockname()[1] + else: + self._actual_port = self.port + + self._server = make_server(self.host, self._actual_port, self.app, threaded=True) + + def run_server(): + self._server.serve_forever() + + self._server_thread = threading.Thread(target=run_server, daemon=True) + self._server_thread.start() + + url = f"http://{self.host}:{self._actual_port}" + print(f"Test server started at {url}") + + return { + "port": self._actual_port, + "host": self.host, + "url": url, + } + + def stop(self) -> None: + """Stop the test server.""" + if hasattr(self, "_server"): + self._server.shutdown() + if self._server_thread: + self._server_thread.join(timeout=5) + print("Test server stopped") + + def get_url(self) -> str: + """Get the server URL.""" + return f"http://{self.host}:{self._actual_port}" + + +if __name__ == "__main__": + # Quick test + server = TestServer() + info = server.start() + print(f"Server running at {info['url']}") + input("Press Enter to stop...") + server.stop() From bcd4a067c1be343f84ae0c8caf6e6dc9818c7a34 Mon Sep 17 00:00:00 2001 From: JY Tan Date: Mon, 19 Jan 2026 19:32:27 -0800 Subject: [PATCH 2/4] Some changes --- .github/workflows/benchmarks.yml | 276 +++++++++++++++++++++++++++ benchmarks/README.md | 33 ++++ benchmarks/bench/common.py | 7 +- benchmarks/bench/resource_monitor.py | 14 +- benchmarks/bench/result_utils.py | 4 +- benchmarks/profile/profile.sh | 8 +- benchmarks/profile/simple_profile.py | 85 +++++---- benchmarks/run_benchmarks.sh | 6 + benchmarks/scripts/compare_runs.py | 177 +++++++++++++++++ benchmarks/server/test_server.py | 16 +- 10 files changed, 564 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/benchmarks.yml create mode 100755 benchmarks/scripts/compare_runs.py diff --git 
a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 0000000..7ff1749 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,276 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + iterations: + description: "Number of iterations for realistic workload benchmark" + required: false + default: "200" + qps_duration: + description: "Duration in seconds for each QPS level" + required: false + default: "10" + compare_with: + description: "Run ID to compare results against (optional)" + required: false + default: "" + +jobs: + benchmark: + name: Run Benchmarks + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Setup Python + run: uv python install 3.9 + + - name: Cache uv + Python installs + venv + uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + ~/.local/share/uv/python + .venv + key: ${{ runner.os }}-uv-benchmark-3.9-${{ hashFiles('uv.lock') }} + + - name: Install dependencies + run: | + uv sync --all-extras + uv pip install flask requests psutil + + - name: Get system info + id: sysinfo + run: | + echo "python_version=$(python --version)" >> $GITHUB_OUTPUT + echo "os=$(uname -s)" >> $GITHUB_OUTPUT + echo "arch=$(uname -m)" >> $GITHUB_OUTPUT + echo "cpu_count=$(nproc)" >> $GITHUB_OUTPUT + echo "memory_gb=$(free -g | awk '/^Mem:/{print $2}')" >> $GITHUB_OUTPUT + + - name: Run realistic workload benchmark + id: realistic + run: | + uv run python benchmarks/bench/realistic_workload.py 2>&1 | tee realistic_output.txt + # Extract just the results JSON + cat benchmarks/results/realistic-workload.json + + - name: Run fixed QPS latency benchmark + id: fixed_qps + run: | + uv run python benchmarks/bench/fixed_qps_latency.py 2>&1 | tee fixed_qps_output.txt + # Extract just the results JSON + cat benchmarks/results/fixed-qps-latency.json + + - name: Generate structured results + id: results + run: | + cat > benchmarks/results/benchmark-summary.json << 'EOF' + { + "metadata": { + "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", + "run_id": "${{ github.run_id }}", + "run_number": "${{ github.run_number }}", + "commit_sha": "${{ github.sha }}", + "branch": "${{ github.ref_name }}", + "triggered_by": "${{ github.actor }}", + "environment": { + "python_version": "${{ steps.sysinfo.outputs.python_version }}", + "os": "${{ steps.sysinfo.outputs.os }}", + "arch": "${{ steps.sysinfo.outputs.arch }}", + "cpu_count": "${{ steps.sysinfo.outputs.cpu_count }}", + "memory_gb": "${{ steps.sysinfo.outputs.memory_gb }}" + } + } + } + EOF + + # Create a proper JSON with jq + jq -n \ + --slurpfile realistic benchmarks/results/realistic-workload.json \ + --slurpfile fixed_qps benchmarks/results/fixed-qps-latency.json \ + --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg run_id "${{ github.run_id }}" \ + --arg run_number "${{ github.run_number }}" \ + --arg commit_sha "${{ github.sha }}" \ + --arg branch "${{ github.ref_name }}" \ + --arg triggered_by "${{ github.actor }}" \ + --arg python_version "${{ steps.sysinfo.outputs.python_version }}" \ + --arg os "${{ steps.sysinfo.outputs.os }}" \ + --arg arch "${{ steps.sysinfo.outputs.arch }}" \ + --arg cpu_count "${{ steps.sysinfo.outputs.cpu_count }}" \ + --arg memory_gb "${{ steps.sysinfo.outputs.memory_gb }}" \ + '{ + metadata: { + timestamp: $timestamp, + run_id: $run_id, + run_number: ($run_number | tonumber), + commit_sha: $commit_sha, + branch: $branch, 
+ triggered_by: $triggered_by, + environment: { + python_version: $python_version, + os: $os, + arch: $arch, + cpu_count: ($cpu_count | tonumber), + memory_gb: ($memory_gb | tonumber) + } + }, + realistic_workload: $realistic[0], + fixed_qps_latency: $fixed_qps[0] + }' > benchmarks/results/benchmark-summary.json + + - name: Generate markdown summary + run: | + SUMMARY_FILE="benchmarks/results/benchmark-summary.md" + + cat > "$SUMMARY_FILE" << EOF + # Benchmark Results + + **Date**: $(date -u +%Y-%m-%d) + **Commit**: ${{ github.sha }} + **Branch**: ${{ github.ref_name }} + **Run ID**: ${{ github.run_id }} + + ## Environment + - Python: ${{ steps.sysinfo.outputs.python_version }} + - OS: ${{ steps.sysinfo.outputs.os }} (${{ steps.sysinfo.outputs.arch }}) + - CPUs: ${{ steps.sysinfo.outputs.cpu_count }} + - Memory: ${{ steps.sysinfo.outputs.memory_gb }} GB + + ## Realistic Workload Results + + EOF + + # Parse and format realistic workload results + jq -r ' + "| Endpoint | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |", + "|----------|----------|------------|----------|-----------|----------|", + (.comparison_100 | to_entries[] | + "| \(.key) | \(.value.baseline_mean_ms | . * 10 | round / 10)ms | \(.value.sdk_mean_ms | . * 10 | round / 10)ms | +\(.value.mean_overhead_ms | . * 10 | round / 10)ms (\(.value.mean_overhead_pct | round)%) | - | - |" + ) + ' benchmarks/results/realistic-workload.json >> "$SUMMARY_FILE" + + cat >> "$SUMMARY_FILE" << 'EOF' + + ## Fixed QPS Latency Results + + ### Mean Latency + + EOF + + jq -r ' + "| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |", + "|-----|----------|------------|----------|-----------|----------|", + (.baseline | to_entries[] | + . as $b | + ($b.key | tostring) as $qps | + "| \($qps) | \($b.value.mean_ms | . 
* 10 | round / 10)ms | - | - | - | - |" + ) + ' benchmarks/results/fixed-qps-latency.json >> "$SUMMARY_FILE" + + cat >> "$SUMMARY_FILE" << 'EOF' + + --- + + 📊 **Full results available in artifacts** + + EOF + + # Also write to GitHub step summary for UI display + cat "$SUMMARY_FILE" >> $GITHUB_STEP_SUMMARY + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.run_id }} + path: | + benchmarks/results/*.json + benchmarks/results/*.md + realistic_output.txt + fixed_qps_output.txt + retention-days: 90 + + - name: Download comparison results (if specified) + if: ${{ inputs.compare_with != '' }} + uses: actions/download-artifact@v4 + with: + name: benchmark-results-${{ inputs.compare_with }} + path: benchmarks/results/comparison/ + continue-on-error: true + + - name: Compare with previous run + if: ${{ inputs.compare_with != '' }} + run: | + if [ -f benchmarks/results/comparison/benchmark-summary.json ]; then + echo "## Comparison with Run ${{ inputs.compare_with }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Compare realistic workload results + PREV_READ=$(jq '.realistic_workload.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/comparison/benchmark-summary.json) + CURR_READ=$(jq '.realistic_workload.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/benchmark-summary.json) + + PREV_WRITE=$(jq '.realistic_workload.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/comparison/benchmark-summary.json) + CURR_WRITE=$(jq '.realistic_workload.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/benchmark-summary.json) + + echo "| Metric | Previous | Current | Delta |" >> $GITHUB_STEP_SUMMARY + echo "|--------|----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + echo "| Read API overhead | ${PREV_READ}ms | ${CURR_READ}ms | $(echo "$CURR_READ - $PREV_READ" | bc)ms |" >> $GITHUB_STEP_SUMMARY + echo "| Write API overhead | ${PREV_WRITE}ms | ${CURR_WRITE}ms | $(echo "$CURR_WRITE - $PREV_WRITE" | bc)ms |" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ Could not find comparison results for run ${{ inputs.compare_with }}" >> $GITHUB_STEP_SUMMARY + fi + + - name: Check for performance regression + id: regression + run: | + # Check if overhead exceeds threshold (3ms for 100% sampling) + THRESHOLD_MS=3.0 + + READ_OVERHEAD=$(jq '.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/realistic-workload.json) + WRITE_OVERHEAD=$(jq '.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/realistic-workload.json) + MIXED_OVERHEAD=$(jq '.comparison_100.realistic_mixed.mean_overhead_ms' benchmarks/results/realistic-workload.json) + + REGRESSION=false + + if (( $(echo "$READ_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then + echo "⚠️ Read API overhead ($READ_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY + REGRESSION=true + fi + + if (( $(echo "$WRITE_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then + echo "⚠️ Write API overhead ($WRITE_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY + REGRESSION=true + fi + + if (( $(echo "$MIXED_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then + echo "⚠️ Mixed API overhead ($MIXED_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY + REGRESSION=true + fi + + if [ "$REGRESSION" = true ]; then + echo "" >> $GITHUB_STEP_SUMMARY + echo "### ⚠️ Performance regression detected" >> $GITHUB_STEP_SUMMARY + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "" >> 
$GITHUB_STEP_SUMMARY + echo "### ✅ No performance regression detected" >> $GITHUB_STEP_SUMMARY + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Output JSON results + run: | + echo "### Structured Results (JSON)" + echo "" + echo '```json' + cat benchmarks/results/benchmark-summary.json + echo '```' diff --git a/benchmarks/README.md b/benchmarks/README.md index 594982c..1489e9b 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -56,6 +56,36 @@ After running benchmarks, compare the results: python benchmarks/compare_benchmarks.py ``` +### Comparing Different Runs + +To compare results from different benchmark runs: + +```bash +# Compare two result files +python benchmarks/scripts/compare_runs.py results/old-results.json results/new-results.json + +# Output as JSON only (for programmatic use) +python benchmarks/scripts/compare_runs.py --json results/old.json results/new.json +``` + +### GitHub Actions Workflow + +Benchmarks can be run via GitHub Actions for consistent, reproducible results: + +1. Go to **Actions** → **Benchmarks** +2. Click **Run workflow** +3. Optionally specify: + - `iterations`: Number of iterations for realistic workload (default: 200) + - `qps_duration`: Duration per QPS level in seconds (default: 10) + - `compare_with`: Run ID to compare against (optional) + +The workflow outputs: + +- Structured JSON with all results and metadata +- Markdown summary in the workflow run +- Artifacts containing full results (retained for 90 days) +- Regression detection if overhead exceeds 3ms threshold + ### Configuration You can configure benchmarks via environment variables: @@ -151,11 +181,14 @@ benchmarks/ │ ├── profile.sh # Profiler runner script │ ├── simple_profile.py # Profiling workload │ └── results/ # Profile output (gitignored) +├── scripts/ +│ └── compare_runs.py # Compare benchmark results across runs ├── server/ │ └── test_server.py # Flask test server ├── results/ # JSON output (gitignored) ├── compare_benchmarks.py # Result comparison script ├── run_benchmarks.sh # Runner script ├── PROFILING.md # Profiling documentation +├── RESULTS.md # Historical results └── README.md ``` diff --git a/benchmarks/bench/common.py b/benchmarks/bench/common.py index d68a156..76e331e 100644 --- a/benchmarks/bench/common.py +++ b/benchmarks/bench/common.py @@ -49,7 +49,12 @@ def run_benchmarks( """Run all benchmarks with the given label.""" opts = {**DEFAULT_OPTIONS, **(options or {})} - enable_memory_tracking = os.environ.get("BENCHMARK_ENABLE_MEMORY", "true").lower() != "false" + # Check environment variable first, then fall back to options + env_memory = os.environ.get("BENCHMARK_ENABLE_MEMORY") + if env_memory is not None: + enable_memory_tracking = env_memory.lower() != "false" + else: + enable_memory_tracking = bool(opts.get("enable_memory_tracking", True)) # Start test server server = TestServer() diff --git a/benchmarks/bench/resource_monitor.py b/benchmarks/bench/resource_monitor.py index 9174e16..8572d24 100644 --- a/benchmarks/bench/resource_monitor.py +++ b/benchmarks/bench/resource_monitor.py @@ -147,7 +147,13 @@ def _monitor_loop(self) -> None: def _collect_memory_sample(self) -> None: """Collect a memory usage sample.""" - if not self._is_running or not self._current_task_stats: + if not self._is_running: + return + + # Take a local snapshot to avoid race condition where another thread + # sets _current_task_stats to None between check and usage + task_stats = self._current_task_stats + if task_stats is None: return # Get RSS from resource module (in 
bytes on macOS, kilobytes on Linux) @@ -169,9 +175,9 @@ def _collect_memory_sample(self) -> None: except (FileNotFoundError, PermissionError): pass # Use ru_maxrss fallback - self._current_task_stats.rss_sum += rss - self._current_task_stats.rss_max = max(self._current_task_stats.rss_max, rss) - self._current_task_stats.sample_count += 1 + task_stats.rss_sum += rss + task_stats.rss_max = max(task_stats.rss_max, rss) + task_stats.sample_count += 1 def get_task_stats(self, task_name: str) -> TaskResourceStats | None: """Get resource statistics for a completed task.""" diff --git a/benchmarks/bench/result_utils.py b/benchmarks/bench/result_utils.py index 1e4a2b6..d581ab6 100644 --- a/benchmarks/bench/result_utils.py +++ b/benchmarks/bench/result_utils.py @@ -8,7 +8,7 @@ import statistics import uuid from dataclasses import dataclass, field -from datetime import UTC, datetime +from datetime import datetime, timezone from pathlib import Path from typing import Any @@ -189,7 +189,7 @@ def create_benchmark_result( return BenchmarkRunResult( id=str(uuid.uuid4()), label=label, - timestamp=datetime.now(UTC).isoformat(), + timestamp=datetime.now(timezone.utc).isoformat(), duration_ms=duration_ms, options=options, system=get_system_info(), diff --git a/benchmarks/profile/profile.sh b/benchmarks/profile/profile.sh index 6583ac9..84f3ca8 100755 --- a/benchmarks/profile/profile.sh +++ b/benchmarks/profile/profile.sh @@ -25,6 +25,10 @@ case "$PROFILER" in exit 1 fi + # Capture resolved binary path before sudo (root's PATH may not include user pip bin) + PYSPY_BIN="$(command -v py-spy)" + PYTHON_BIN="$(command -v python)" + OUTPUT="$RESULTS_DIR/flamegraph_$(date +%Y%m%d_%H%M%S).svg" echo "Output: $OUTPUT" echo "" @@ -32,9 +36,9 @@ case "$PROFILER" in # py-spy needs sudo on macOS for sampling if [[ "$OSTYPE" == "darwin"* ]]; then echo "Note: py-spy may require sudo on macOS" - sudo py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + sudo "$PYSPY_BIN" record -o "$OUTPUT" --rate 100 -- "$PYTHON_BIN" benchmarks/profile/simple_profile.py else - py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + "$PYSPY_BIN" record -o "$OUTPUT" --rate 100 -- "$PYTHON_BIN" benchmarks/profile/simple_profile.py fi echo "" diff --git a/benchmarks/profile/simple_profile.py b/benchmarks/profile/simple_profile.py index 22bfb1e..93f3b76 100644 --- a/benchmarks/profile/simple_profile.py +++ b/benchmarks/profile/simple_profile.py @@ -66,48 +66,49 @@ def main(): # Use session for connection pooling session = requests.Session() - print(f"\nTest server started at {server_url}") - print("Running profiling workload...") - - # Number of iterations (adjust for longer/shorter profiling) - iterations = 500 - - start_time = time.time() - - for i in range(iterations): - # Mix of different request types - if i % 3 == 0: - # Simple GET - response = session.get(f"{server_url}/api/simple") - response.json() - elif i % 3 == 1: - # POST with JSON body - response = session.post( - f"{server_url}/api/echo", - json={"data": "test", "iteration": i}, - ) - response.json() - else: - # Sensitive data endpoint (triggers transform checks) - response = session.post( - f"{server_url}/api/auth/login", - json={"email": "test@example.com", "password": "secret123"}, - ) - response.json() - - if (i + 1) % 100 == 0: - elapsed = time.time() - start_time - rate = (i + 1) / elapsed - print(f" {i + 1}/{iterations} iterations ({rate:.1f} req/s)") - - elapsed = time.time() - start_time - print(f"\nCompleted 
{iterations} iterations in {elapsed:.2f}s") - print(f"Average: {iterations / elapsed:.1f} req/s") - - # Cleanup - session.close() - server.stop() - sdk.shutdown() + try: + print(f"\nTest server started at {server_url}") + print("Running profiling workload...") + + # Number of iterations (adjust for longer/shorter profiling) + iterations = 500 + + start_time = time.time() + + for i in range(iterations): + # Mix of different request types + if i % 3 == 0: + # Simple GET + response = session.get(f"{server_url}/api/simple") + response.json() + elif i % 3 == 1: + # POST with JSON body + response = session.post( + f"{server_url}/api/echo", + json={"data": "test", "iteration": i}, + ) + response.json() + else: + # Sensitive data endpoint (triggers transform checks) + response = session.post( + f"{server_url}/api/auth/login", + json={"email": "test@example.com", "password": "secret123"}, + ) + response.json() + + if (i + 1) % 100 == 0: + elapsed = time.time() - start_time + rate = (i + 1) / elapsed + print(f" {i + 1}/{iterations} iterations ({rate:.1f} req/s)") + + elapsed = time.time() - start_time + print(f"\nCompleted {iterations} iterations in {elapsed:.2f}s") + print(f"Average: {iterations / elapsed:.1f} req/s") + finally: + # Cleanup - always execute even if an exception is raised + session.close() + server.stop() + sdk.shutdown() if __name__ == "__main__": diff --git a/benchmarks/run_benchmarks.sh b/benchmarks/run_benchmarks.sh index 6367d74..1266f49 100755 --- a/benchmarks/run_benchmarks.sh +++ b/benchmarks/run_benchmarks.sh @@ -25,6 +25,12 @@ if ! python -c "import flask" 2>/dev/null; then uv pip install flask fi +# Check for requests +if ! python -c "import requests" 2>/dev/null; then + echo "Installing requests..." + uv pip install requests +fi + echo "" echo "Running SDK Disabled (baseline)..." echo "----------------------------------------------" diff --git a/benchmarks/scripts/compare_runs.py b/benchmarks/scripts/compare_runs.py new file mode 100755 index 0000000..18351ea --- /dev/null +++ b/benchmarks/scripts/compare_runs.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Compare benchmark results from different runs. 
+ +Usage: + python compare_runs.py results1.json results2.json + python compare_runs.py --download RUN_ID1 RUN_ID2 # Download from GitHub Actions +""" + +import argparse +import json + + +def load_results(path: str) -> dict: + """Load benchmark results from JSON file.""" + with open(path) as f: + return json.load(f) + + +def compare_realistic_workload(old: dict, new: dict) -> list[dict]: + """Compare realistic workload results.""" + comparisons = [] + + old_comp = old.get("comparison_100", old.get("realistic_workload", {}).get("comparison_100", {})) + new_comp = new.get("comparison_100", new.get("realistic_workload", {}).get("comparison_100", {})) + + for endpoint in ["typical_read", "typical_write", "realistic_mixed"]: + if endpoint in old_comp and endpoint in new_comp: + old_overhead = old_comp[endpoint]["mean_overhead_ms"] + new_overhead = new_comp[endpoint]["mean_overhead_ms"] + delta = new_overhead - old_overhead + delta_pct = (delta / old_overhead * 100) if old_overhead != 0 else 0 + + comparisons.append( + { + "endpoint": endpoint, + "old_overhead_ms": round(old_overhead, 2), + "new_overhead_ms": round(new_overhead, 2), + "delta_ms": round(delta, 2), + "delta_pct": round(delta_pct, 1), + "regression": delta > 0.5, # Flag if overhead increased by more than 0.5ms + } + ) + + return comparisons + + +def compare_fixed_qps(old: dict, new: dict) -> list[dict]: + """Compare fixed QPS results.""" + comparisons = [] + + old_baseline = old.get("baseline", old.get("fixed_qps_latency", {}).get("baseline", {})) + new_baseline = new.get("baseline", new.get("fixed_qps_latency", {}).get("baseline", {})) + + old_sdk = old.get("sdk_100", old.get("fixed_qps_latency", {}).get("sdk_100", {})) + new_sdk = new.get("sdk_100", new.get("fixed_qps_latency", {}).get("sdk_100", {})) + + for qps in ["25", "50", "75"]: + if qps in old_sdk and qps in new_sdk: + old_overhead = old_sdk[qps]["mean_ms"] - old_baseline.get(qps, {}).get("mean_ms", 0) + new_overhead = new_sdk[qps]["mean_ms"] - new_baseline.get(qps, {}).get("mean_ms", 0) + delta = new_overhead - old_overhead + + comparisons.append( + { + "qps": int(qps), + "old_overhead_ms": round(old_overhead, 2), + "new_overhead_ms": round(new_overhead, 2), + "delta_ms": round(delta, 2), + "regression": delta > 0.5, + } + ) + + return comparisons + + +def print_comparison(old_path: str, new_path: str, old: dict, new: dict) -> None: + """Print comparison results.""" + print("=" * 70) + print("Benchmark Comparison") + print("=" * 70) + print() + print(f"Old: {old_path}") + print(f"New: {new_path}") + print() + + # Metadata comparison if available + old_meta = old.get("metadata", {}) + new_meta = new.get("metadata", {}) + + if old_meta or new_meta: + print("### Metadata") + print(f" Old run: {old_meta.get('run_id', 'N/A')} @ {old_meta.get('timestamp', 'N/A')}") + print(f" New run: {new_meta.get('run_id', 'N/A')} @ {new_meta.get('timestamp', 'N/A')}") + print() + + # Realistic workload comparison + print("### Realistic Workload (100% sampling)") + print() + print("| Endpoint | Old Overhead | New Overhead | Delta | Status |") + print("|----------|--------------|--------------|-------|--------|") + + realistic_comps = compare_realistic_workload(old, new) + has_regression = False + + for comp in realistic_comps: + status = "⚠️ REGRESSION" if comp["regression"] else "✅ OK" + if comp["regression"]: + has_regression = True + print( + f"| {comp['endpoint']} | {comp['old_overhead_ms']}ms | {comp['new_overhead_ms']}ms | {comp['delta_ms']:+.2f}ms | {status} |" + ) + + print() + + # 
Fixed QPS comparison + print("### Fixed QPS Latency (100% sampling)") + print() + print("| QPS | Old Overhead | New Overhead | Delta | Status |") + print("|-----|--------------|--------------|-------|--------|") + + qps_comps = compare_fixed_qps(old, new) + + for comp in qps_comps: + status = "⚠️ REGRESSION" if comp["regression"] else "✅ OK" + if comp["regression"]: + has_regression = True + print( + f"| {comp['qps']} | {comp['old_overhead_ms']}ms | {comp['new_overhead_ms']}ms | {comp['delta_ms']:+.2f}ms | {status} |" + ) + + print() + + if has_regression: + print("⚠️ REGRESSION DETECTED: Some metrics show increased overhead") + else: + print("✅ No regressions detected") + + # Output as JSON for programmatic use + output = { + "realistic_workload": realistic_comps, + "fixed_qps": qps_comps, + "has_regression": has_regression, + } + + print() + print("### JSON Output") + print(json.dumps(output, indent=2)) + + +def main(): + parser = argparse.ArgumentParser(description="Compare benchmark results") + parser.add_argument("old", help="Path to old results JSON") + parser.add_argument("new", help="Path to new results JSON") + parser.add_argument("--json", action="store_true", help="Output only JSON") + + args = parser.parse_args() + + old = load_results(args.old) + new = load_results(args.new) + + if args.json: + realistic_comps = compare_realistic_workload(old, new) + qps_comps = compare_fixed_qps(old, new) + has_regression = any(c["regression"] for c in realistic_comps + qps_comps) + + output = { + "realistic_workload": realistic_comps, + "fixed_qps": qps_comps, + "has_regression": has_regression, + } + print(json.dumps(output, indent=2)) + else: + print_comparison(args.old, args.new, old, new) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/server/test_server.py b/benchmarks/server/test_server.py index 5cdddb3..e747a50 100644 --- a/benchmarks/server/test_server.py +++ b/benchmarks/server/test_server.py @@ -369,19 +369,13 @@ def typical_write() -> Response: def start(self) -> dict[str, Any]: """Start the test server in a background thread.""" - import socket - from werkzeug.serving import make_server - # Find a free port if port is 0 - if self.port == 0: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind((self.host, 0)) - self._actual_port = s.getsockname()[1] - else: - self._actual_port = self.port - - self._server = make_server(self.host, self._actual_port, self.app, threaded=True) + # Let make_server bind to port 0 to get an ephemeral port + # This avoids TOCTOU race where a separate socket finds a free port + # but another process binds to it before we can + self._server = make_server(self.host, self.port, self.app, threaded=True) + self._actual_port = self._server.server_port def run_server(): self._server.serve_forever() From 5d205509fba724e6c1107463327e9291fcf12623 Mon Sep 17 00:00:00 2001 From: JY Tan Date: Mon, 19 Jan 2026 20:05:09 -0800 Subject: [PATCH 3/4] Fix --- .github/workflows/benchmarks.yml | 4 ++++ benchmarks/bench/fixed_qps_latency.py | 4 ++-- benchmarks/bench/realistic_workload.py | 4 ++-- benchmarks/scripts/compare_runs.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7ff1749..87af87b 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -59,6 +59,8 @@ jobs: - name: Run realistic workload benchmark id: realistic + env: + BENCHMARK_ITERATIONS: ${{ inputs.iterations }} run: | uv run python 
benchmarks/bench/realistic_workload.py 2>&1 | tee realistic_output.txt # Extract just the results JSON @@ -66,6 +68,8 @@ jobs: - name: Run fixed QPS latency benchmark id: fixed_qps + env: + BENCHMARK_QPS_DURATION: ${{ inputs.qps_duration }} run: | uv run python benchmarks/bench/fixed_qps_latency.py 2>&1 | tee fixed_qps_output.txt # Extract just the results JSON diff --git a/benchmarks/bench/fixed_qps_latency.py b/benchmarks/bench/fixed_qps_latency.py index 2be9dda..58bc6f5 100644 --- a/benchmarks/bench/fixed_qps_latency.py +++ b/benchmarks/bench/fixed_qps_latency.py @@ -155,9 +155,9 @@ def main(): print("Endpoint: POST /api/realistic (~10-15ms baseline)") print() - # Test at multiple QPS levels + # Test at multiple QPS levels (duration configurable via BENCHMARK_QPS_DURATION env var) qps_levels = [25, 50, 75] # Requests per second - duration = 10 # seconds per test + duration = int(os.environ.get("BENCHMARK_QPS_DURATION", "10")) # seconds per test results = {"baseline": {}, "sdk_100": {}, "sdk_10": {}} diff --git a/benchmarks/bench/realistic_workload.py b/benchmarks/bench/realistic_workload.py index 77b3c0e..725ec94 100644 --- a/benchmarks/bench/realistic_workload.py +++ b/benchmarks/bench/realistic_workload.py @@ -70,8 +70,8 @@ def run_benchmark_subprocess(mode: str, sampling_rate: float = 1.0) -> dict | No session.post(f"{{server_url}}/api/typical-write", json={{"name": "test"}}) session.post(f"{{server_url}}/api/realistic", json={{"userId": "u1", "query": "test"}}) -# Benchmark parameters -iterations = 200 +# Benchmark parameters (configurable via BENCHMARK_ITERATIONS env var) +iterations = int(os.environ.get("BENCHMARK_ITERATIONS", "200")) results = {{}} # Test 1: Typical Read (~5-10ms baseline) diff --git a/benchmarks/scripts/compare_runs.py b/benchmarks/scripts/compare_runs.py index 18351ea..20ab534 100755 --- a/benchmarks/scripts/compare_runs.py +++ b/benchmarks/scripts/compare_runs.py @@ -3,7 +3,7 @@ Usage: python compare_runs.py results1.json results2.json - python compare_runs.py --download RUN_ID1 RUN_ID2 # Download from GitHub Actions + python compare_runs.py --json results1.json results2.json # JSON output only """ import argparse From 4f0dfac08f2a70260ae62876a32b4ab7f7c4bfc9 Mon Sep 17 00:00:00 2001 From: JY Tan Date: Mon, 19 Jan 2026 20:09:57 -0800 Subject: [PATCH 4/4] Fix --- benchmarks/bench/fixed_qps_latency.py | 5 ++++- benchmarks/bench/realistic_workload.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/bench/fixed_qps_latency.py b/benchmarks/bench/fixed_qps_latency.py index 58bc6f5..b74a2d5 100644 --- a/benchmarks/bench/fixed_qps_latency.py +++ b/benchmarks/bench/fixed_qps_latency.py @@ -157,7 +157,10 @@ def main(): # Test at multiple QPS levels (duration configurable via BENCHMARK_QPS_DURATION env var) qps_levels = [25, 50, 75] # Requests per second - duration = int(os.environ.get("BENCHMARK_QPS_DURATION", "10")) # seconds per test + try: + duration = int(os.environ.get("BENCHMARK_QPS_DURATION", "10")) + except ValueError: + duration = 10 # Fall back to default on invalid input results = {"baseline": {}, "sdk_100": {}, "sdk_10": {}} diff --git a/benchmarks/bench/realistic_workload.py b/benchmarks/bench/realistic_workload.py index 725ec94..7d7edf8 100644 --- a/benchmarks/bench/realistic_workload.py +++ b/benchmarks/bench/realistic_workload.py @@ -71,7 +71,10 @@ def run_benchmark_subprocess(mode: str, sampling_rate: float = 1.0) -> dict | No session.post(f"{{server_url}}/api/realistic", json={{"userId": "u1", "query": "test"}}) # 
Benchmark parameters (configurable via BENCHMARK_ITERATIONS env var) -iterations = int(os.environ.get("BENCHMARK_ITERATIONS", "200")) +try: + iterations = int(os.environ.get("BENCHMARK_ITERATIONS", "200")) +except ValueError: + iterations = 200 # Fall back to default on invalid input results = {{}} # Test 1: Typical Read (~5-10ms baseline)
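
Editor's note on the last patch: the try/except ValueError fallback it introduces is duplicated in `fixed_qps_latency.py` and `realistic_workload.py`. Below is a minimal sketch of a shared helper the two scripts could import instead; the module path `benchmarks/bench/env_utils.py` and the function name are hypothetical and not part of this patch series.

```python
# Hypothetical helper (e.g. benchmarks/bench/env_utils.py); a sketch, not part of the patch.
import os


def int_from_env(name: str, default: int) -> int:
    """Read an integer environment variable, falling back to `default`
    when the variable is unset or not a valid integer."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Invalid input falls back to the default, as in the patch;
        # the warning is an addition here for visibility.
        print(f"Warning: invalid {name}={raw!r}; falling back to {default}")
        return default


# Usage matching the two call sites touched above:
# duration = int_from_env("BENCHMARK_QPS_DURATION", 10)    # fixed_qps_latency.py
# iterations = int_from_env("BENCHMARK_ITERATIONS", 200)   # realistic_workload.py
```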