From d57e914645e8c30b0f5a85a8036d804cda7d339a Mon Sep 17 00:00:00 2001 From: JY Tan Date: Wed, 14 Jan 2026 20:34:51 -0800 Subject: [PATCH 1/4] Commit --- benchmarks/.gitignore | 5 + benchmarks/PROFILING.md | 212 +++++++++ benchmarks/README.md | 161 +++++++ benchmarks/bench/__init__.py | 1 + benchmarks/bench/common.py | 227 ++++++++++ benchmarks/bench/fixed_qps_latency.py | 258 +++++++++++ benchmarks/bench/realistic_workload.py | 300 +++++++++++++ benchmarks/bench/resource_monitor.py | 202 +++++++++ benchmarks/bench/result_utils.py | 337 ++++++++++++++ benchmarks/bench/sdk_active.py | 49 ++ .../bench/sdk_active_with_transforms.py | 71 +++ benchmarks/bench/sdk_disabled.py | 16 + benchmarks/bench/sdk_sampling_rates.py | 108 +++++ benchmarks/compare_benchmarks.py | 275 ++++++++++++ benchmarks/profile/.gitignore | 2 + benchmarks/profile/profile.sh | 135 ++++++ benchmarks/profile/simple_profile.py | 114 +++++ benchmarks/run_benchmarks.sh | 47 ++ benchmarks/server/__init__.py | 1 + benchmarks/server/test_server.py | 420 ++++++++++++++++++ 20 files changed, 2941 insertions(+) create mode 100644 benchmarks/.gitignore create mode 100644 benchmarks/PROFILING.md create mode 100644 benchmarks/README.md create mode 100644 benchmarks/bench/__init__.py create mode 100644 benchmarks/bench/common.py create mode 100644 benchmarks/bench/fixed_qps_latency.py create mode 100644 benchmarks/bench/realistic_workload.py create mode 100644 benchmarks/bench/resource_monitor.py create mode 100644 benchmarks/bench/result_utils.py create mode 100644 benchmarks/bench/sdk_active.py create mode 100644 benchmarks/bench/sdk_active_with_transforms.py create mode 100644 benchmarks/bench/sdk_disabled.py create mode 100644 benchmarks/bench/sdk_sampling_rates.py create mode 100644 benchmarks/compare_benchmarks.py create mode 100644 benchmarks/profile/.gitignore create mode 100755 benchmarks/profile/profile.sh create mode 100644 benchmarks/profile/simple_profile.py create mode 100755 benchmarks/run_benchmarks.sh create mode 100644 benchmarks/server/__init__.py create mode 100644 benchmarks/server/test_server.py diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 0000000..260fbf0 --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,5 @@ +# Benchmark results (regenerated each run) +results/ + +# Trace directories created during benchmarks +.benchmark-traces*/ diff --git a/benchmarks/PROFILING.md b/benchmarks/PROFILING.md new file mode 100644 index 0000000..2e19ea0 --- /dev/null +++ b/benchmarks/PROFILING.md @@ -0,0 +1,212 @@ +# Profiling the Drift Python SDK + +This document explains how to profile the SDK to understand where performance overhead comes from. + +## Quick Start + +```bash +cd /path/to/drift-python-sdk + +# Run cProfile (recommended starting point) +./benchmarks/profile/profile.sh cprofile +``` + +## Available Profilers + +### 1. cProfile (Built-in, Deterministic) + +**Best for:** Understanding call counts and cumulative time per function. + +```bash +./benchmarks/profile/profile.sh cprofile +``` + +This runs the profiling workload and outputs: + +- A `.prof` file in `benchmarks/profile/results/` +- Summary of top functions by cumulative and total time + +**View interactively with snakeviz:** + +```bash +pip install snakeviz +snakeviz benchmarks/profile/results/cprofile_*.prof +``` + +Opens an interactive sunburst diagram in your browser showing the call hierarchy. + +### 2. py-spy (Sampling, Flame Graphs) + +**Best for:** Low-overhead profiling and classic flame graph visualization. 
+ +```bash +# Requires sudo on macOS +sudo ./benchmarks/profile/profile.sh pyspy +``` + +Generates an SVG flame graph that you can open in any browser. + +**Install:** + +```bash +pip install py-spy +# Or on macOS: brew install py-spy +``` + +### 3. Scalene (CPU + Memory) + +**Best for:** Understanding both CPU time and memory allocation per line. + +```bash +pip install scalene +./benchmarks/profile/profile.sh scalene +``` + +Generates an HTML report with line-by-line CPU and memory breakdown. + +### 4. VizTracer (Timeline) + +**Best for:** Seeing the sequence and timing of function calls over time. + +```bash +pip install viztracer +./benchmarks/profile/profile.sh viztracer +``` + +**View the trace:** + +```bash +vizviewer benchmarks/profile/results/viztracer_*.json +``` + +Opens a Chrome DevTools-style timeline visualization. + +## Profile Analysis Results + +Based on profiling 500 HTTP requests through the SDK: + +### Top Overhead Sources + +| Function | Time/Call | Description | +|----------|-----------|-------------| +| `span_serialization.clean_span_to_proto` | ~1.7ms | Converting spans to protobuf format | +| `td_span_processor.on_end` | ~2.1ms | Processing spans when they complete | +| `handler.finalize_wsgi_span` | ~2.2ms | Finalizing HTTP/WSGI spans | +| `otel_converter.otel_span_to_clean_span_data` | ~0.4ms | Converting OpenTelemetry spans | + +### Key Findings + +1. **Span serialization is the biggest bottleneck** + - `_dict_to_struct`, `_value_to_proto`, `_json_schema_to_proto` are called recursively + - Converting rich span data to protobuf is inherently expensive + +2. **The instrumentation itself is cheap** + - Function patching/wrapping adds minimal overhead + - Most time is spent in span processing, not in the hooks + +3. **Sampling reduces overhead proportionally** + - At 10% sampling, most requests skip span serialization entirely + - This explains why lower sampling rates dramatically improve performance + +### Optimization Opportunities + +Based on the profile data: + +1. Lazy serialization - Defer protobuf conversion until export time +2. Batch serialization - Serialize multiple spans together +3. Schema caching - Cache JSON schema conversions +4. Attribute filtering - Skip serializing large/unnecessary attributes + +## Custom Profiling + +### Profile a Specific Workload + +Edit `benchmarks/profile/simple_profile.py` to customize: + +```python +# Adjust number of iterations +iterations = 1000 + +# Change request mix +if i % 3 == 0: + # Your custom endpoint + response = session.get(f"{server_url}/your-endpoint") +``` + +### Profile with Different SDK Settings + +```python +# In simple_profile.py, modify SDK initialization: +sdk = TuskDrift.initialize( + api_key="profile-test-key", + env="profile", + sampling_rate=0.1, # Test with different sampling rates + transforms={...}, # Test with transforms enabled + log_level="warning", +) +``` + +### Profile Production Code + +You can use py-spy to attach to a running process: + +```bash +# Find your Python process PID +ps aux | grep python + +# Attach and record +sudo py-spy record -o profile.svg --pid --duration 30 +``` + +## Comparing Before/After Changes + +1. Run profile before changes: + + ```bash + ./benchmarks/profile/profile.sh cprofile + mv benchmarks/profile/results/cprofile_*.prof benchmarks/profile/results/before.prof + ``` + +2. Make your changes + +3. 
Run profile after changes: + + ```bash + ./benchmarks/profile/profile.sh cprofile + mv benchmarks/profile/results/cprofile_*.prof benchmarks/profile/results/after.prof + ``` + +4. Compare with pstats: + + ```python + import pstats + + before = pstats.Stats('benchmarks/profile/results/before.prof') + after = pstats.Stats('benchmarks/profile/results/after.prof') + + print("=== BEFORE ===") + before.strip_dirs().sort_stats('cumulative').print_stats(20) + + print("=== AFTER ===") + after.strip_dirs().sort_stats('cumulative').print_stats(20) + ``` + +## Output Files + +Profile results are saved to `benchmarks/profile/results/` (gitignored): + +| File | Description | +|------|-------------| +| `cprofile_*.prof` | cProfile binary data | +| `flamegraph_*.svg` | py-spy flame graph | +| `scalene_*.html` | Scalene HTML report | +| `viztracer_*.json` | VizTracer timeline data | +| `traces/` | SDK trace output during profiling | + +## Tips + +- **Start with cProfile** - It's built-in and gives good overview +- **Use snakeviz for exploration** - Interactive visualization helps find hotspots +- **Profile realistic workloads** - Micro-benchmarks may not reflect production patterns +- **Compare sampling rates** - Profile with 100% vs 10% sampling to see the difference +- **Watch for I/O** - File writes and network calls can dominate profiles diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..594982c --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,161 @@ +# Benchmarks + +These benchmarks measure the performance overhead of the Drift Python SDK. + +## Overview + +The benchmark suite runs a Flask test server and makes HTTP requests to various endpoints +while measuring latency, throughput, CPU usage, and memory consumption. Three configurations +are tested: + +1. SDK Disabled (baseline) - No SDK instrumentation +2. SDK Active - SDK in RECORD mode, capturing traces +3. 
SDK Active with Transforms - SDK with data transformation rules enabled + +## Usage + +### Prerequisites + +Make sure you have the required dependencies: + +```bash +cd /path/to/drift-python-sdk +uv pip install psutil flask requests +``` + +### Running All Benchmarks + +```bash +# Run all benchmarks and compare results +./run_benchmarks.sh +``` + +### Running Individual Benchmarks + +```bash +# Realistic workload benchmark (recommended) +python benchmarks/bench/realistic_workload.py + +# Fixed QPS latency test (measures latency at controlled request rates) +python benchmarks/bench/fixed_qps_latency.py + +# Synthetic benchmarks (stress tests) +python benchmarks/bench/sdk_disabled.py +python benchmarks/bench/sdk_active.py +python benchmarks/bench/sdk_active_with_transforms.py + +# Sampling rate comparison +python benchmarks/bench/sdk_sampling_rates.py +``` + +### Comparing Results + +After running benchmarks, compare the results: + +```bash +python benchmarks/compare_benchmarks.py +``` + +### Configuration + +You can configure benchmarks via environment variables: + +- `BENCHMARK_ENABLE_MEMORY=false` - Disable memory monitoring (reduces CPU overhead) + +Or modify the options in `common.py`: + +```python +DEFAULT_OPTIONS = { + "time_per_task_ms": 10_000, # Duration per task (10 seconds default) + "warmup_iterations": 5, # Warmup iterations before measurement + "enable_memory_tracking": True, +} +``` + +## Benchmark Tasks + +### Realistic Workloads (Recommended) + +These endpoints simulate production API behavior: + +- **GET /api/typical-read** (~5-10ms): Auth check + DB read + response serialization +- **POST /api/typical-write** (~15-25ms): Validation + DB write + response +- **POST /api/realistic** (~10-20ms): Validation + DB query + data processing + response + +### Synthetic Workloads + +These are stress tests, not representative of production: + +- **POST /api/compute-hash**: Pure CPU (iterative SHA-256) - useful for profiling +- **POST /api/io-bound**: Pure I/O (sleep delays) - tests baseline overhead +- **POST /api/auth/login, /api/users**: Sensitive data for transform testing + +## Output + +Results are saved to `benchmarks/results/` as JSON files: + +- `sdk-disabled.json` - Baseline results +- `sdk-active.json` - SDK enabled results +- `sdk-active-with-transforms.json` - SDK with transforms results + +The comparison script outputs a markdown table showing: + +- Throughput delta (negative = worse) +- Tail latency (p99) delta (positive = worse) +- CPU usage delta +- Memory overhead + +## Interpreting Results + +- **Throughput $\Delta$**: Percentage change in operations/second. Negative means slower. +- **Tail Latency $\Delta$**: Percentage change in p99 latency. Positive means slower. +- **CPU User $\Delta$**: Change in user-space CPU percentage. +- **Memory $\Delta$**: Additional memory used by the SDK. + +Ideally, the SDK should have minimal impact: + +- Throughput should be within ±5% +- Tail latency increase should be <10% +- Memory overhead should be reasonable (<50MB) + +## Profiling + +For detailed profiling to understand where SDK overhead comes from, see **[PROFILING.md](./PROFILING.md)**. 
+ +Quick start: + +```bash +# Run cProfile analysis +./benchmarks/profile/profile.sh cprofile + +# View interactively +pip install snakeviz +snakeviz benchmarks/profile/results/cprofile_*.prof +``` + +## Architecture + +```text +benchmarks/ +├── bench/ +│ ├── common.py # Shared benchmark logic +│ ├── fixed_qps_latency.py # Fixed QPS latency test +│ ├── realistic_workload.py # Realistic API workload benchmark +│ ├── resource_monitor.py # CPU/memory monitoring +│ ├── result_utils.py # Result serialization +│ ├── sdk_disabled.py # Baseline benchmark (synthetic) +│ ├── sdk_active.py # SDK active benchmark (synthetic) +│ ├── sdk_active_with_transforms.py +│ └── sdk_sampling_rates.py # Sampling rate impact benchmark +├── profile/ +│ ├── profile.sh # Profiler runner script +│ ├── simple_profile.py # Profiling workload +│ └── results/ # Profile output (gitignored) +├── server/ +│ └── test_server.py # Flask test server +├── results/ # JSON output (gitignored) +├── compare_benchmarks.py # Result comparison script +├── run_benchmarks.sh # Runner script +├── PROFILING.md # Profiling documentation +└── README.md +``` diff --git a/benchmarks/bench/__init__.py b/benchmarks/bench/__init__.py new file mode 100644 index 0000000..854b623 --- /dev/null +++ b/benchmarks/bench/__init__.py @@ -0,0 +1 @@ +# Benchmark utilities diff --git a/benchmarks/bench/common.py b/benchmarks/bench/common.py new file mode 100644 index 0000000..d68a156 --- /dev/null +++ b/benchmarks/bench/common.py @@ -0,0 +1,227 @@ +"""Common benchmark runner logic.""" + +from __future__ import annotations + +import os +import sys +import time +from collections.abc import Callable +from typing import Any + +import requests + +# Add parent to path for imports +sys.path.insert(0, str(__file__).rsplit("/", 3)[0]) + +from benchmarks.bench.resource_monitor import ResourceMonitor +from benchmarks.bench.result_utils import ( + create_benchmark_result, + create_task_result, + persist_result, +) +from benchmarks.server.test_server import TestServer + +# Default benchmark options +DEFAULT_OPTIONS = { + "time_per_task_ms": 10_000, # Run each task for 10 seconds + "warmup_iterations": 5, + "enable_memory_tracking": True, +} + + +class BenchmarkTask: + """A single benchmark task.""" + + def __init__( + self, + name: str, + func: Callable[[], Any], + ): + self.name = name + self.func = func + self.samples_ns: list[float] = [] + + +def run_benchmarks( + label: str = "benchmark", + options: dict[str, Any] | None = None, +) -> None: + """Run all benchmarks with the given label.""" + opts = {**DEFAULT_OPTIONS, **(options or {})} + + enable_memory_tracking = os.environ.get("BENCHMARK_ENABLE_MEMORY", "true").lower() != "false" + + # Start test server + server = TestServer() + server_info = server.start() + server_url = server_info["url"] + + # Wait for server to be ready + _wait_for_server(server_url) + + # Initialize resource monitor + resource_monitor = ResourceMonitor( + interval_ms=100, + enable_memory_tracking=enable_memory_tracking, + ) + + # Define benchmark tasks + tasks = _create_tasks(server_url, label) + + print(f"\n{'=' * 60}") + print(f"Running benchmarks: {label}") + print(f"{'=' * 60}\n") + + benchmark_start = time.time() + resource_monitor.start() + + task_results = [] + for task in tasks: + print(f"Running: {task.name}") + resource_monitor.start_task(task.name) + + # Warmup + for _ in range(opts["warmup_iterations"]): + task.func() + + # Actual benchmark + task_start = time.time() + while (time.time() - task_start) * 1000 < 
opts["time_per_task_ms"]: + iter_start = time.perf_counter_ns() + task.func() + iter_end = time.perf_counter_ns() + task.samples_ns.append(iter_end - iter_start) + + resource_monitor.end_task() + + # Create result for this task + task_result = create_task_result(task.name, task.samples_ns, resource_monitor) + task_results.append(task_result) + + # Print summary + if task_result.latency.mean: + mean_ms = task_result.latency.mean / 1_000_000 + print(f" Samples: {task_result.samples}, Mean: {mean_ms:.2f}ms") + if task_result.throughput.mean: + print(f" Throughput: {task_result.throughput.mean:.1f} ops/s") + print() + + resource_monitor.stop() + benchmark_duration_ms = (time.time() - benchmark_start) * 1000 + + # Create and save result + result = create_benchmark_result( + label=label, + tasks=task_results, + duration_ms=benchmark_duration_ms, + options=opts, + ) + + output_path = persist_result(result) + print(f"\n{'=' * 60}") + print("Benchmark complete!") + print(f"Results saved to: {output_path}") + print(f"Total duration: {benchmark_duration_ms / 1000:.1f}s") + print(f"{'=' * 60}\n") + + # Stop server + server.stop() + + +def _wait_for_server(url: str, timeout: float = 10.0) -> None: + """Wait for the server to be ready.""" + start = time.time() + while time.time() - start < timeout: + try: + response = requests.get(f"{url}/health", timeout=1) + if response.status_code == 200: + return + except requests.RequestException: + pass + time.sleep(0.1) + raise RuntimeError(f"Server at {url} did not become ready in {timeout}s") + + +def _create_tasks(server_url: str, test_name: str) -> list[BenchmarkTask]: + """Create benchmark tasks.""" + tasks = [] + + # Use a session for connection pooling to avoid port exhaustion + session = requests.Session() + + # Task 1: High CPU (compute hash) + def cpu_bound(): + response = session.post( + f"{server_url}/api/compute-hash", + json={"data": "sensitive-data-to-hash", "iterations": 1000}, + timeout=30, + ) + response.raise_for_status() + return response.json() + + tasks.append( + BenchmarkTask( + name=f"High CPU: POST /api/compute-hash ({test_name})", + func=cpu_bound, + ) + ) + + # Task 2: High IO, Low CPU + def io_bound(): + response = session.post( + f"{server_url}/api/io-bound", + json={"jobs": 5, "delayMs": 5}, + timeout=30, + ) + response.raise_for_status() + return response.json() + + tasks.append( + BenchmarkTask( + name=f"High IO, Low CPU: POST /api/io-bound ({test_name})", + func=io_bound, + ) + ) + + # Task 3: Transform endpoints (cycling through sensitive data endpoints) + transform_endpoints = [ + { + "path": "/api/auth/login", + "body": {"email": "user@example.com", "password": "super-secret-password-123"}, + }, + { + "path": "/api/users", + "body": { + "username": "testuser", + "email": "test@example.com", + "ssn": "123-45-6789", + "creditCard": "4111-1111-1111-1111", + }, + }, + ] + endpoint_index = [0] # Use list to allow mutation in closure + + def transform_task(): + endpoint = transform_endpoints[endpoint_index[0] % len(transform_endpoints)] + endpoint_index[0] += 1 + response = session.post( + f"{server_url}{endpoint['path']}", + json=endpoint["body"], + timeout=30, + ) + response.raise_for_status() + return response.json() + + tasks.append( + BenchmarkTask( + name=f"Transform endpoints ({test_name})", + func=transform_task, + ) + ) + + return tasks + + +if __name__ == "__main__": + # Quick test run + run_benchmarks(label="test-run", options={"time_per_task_ms": 3000}) diff --git a/benchmarks/bench/fixed_qps_latency.py 
b/benchmarks/bench/fixed_qps_latency.py new file mode 100644 index 0000000..2be9dda --- /dev/null +++ b/benchmarks/bench/fixed_qps_latency.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""Benchmark: Fixed QPS Latency Test. + +This benchmark measures latency at fixed request rates (QPS) to show +how the SDK affects response times under controlled load. + +Unlike throughput tests that blast requests as fast as possible, this +sends requests at a steady rate to measure realistic latency impact. +""" + +import json +import os +import subprocess +import sys +from pathlib import Path + + +def run_fixed_qps_benchmark( + mode: str, target_qps: int, duration_seconds: int = 10, sampling_rate: float = 1.0 +) -> dict | None: + """Run benchmark at a fixed QPS in a subprocess.""" + + script = f''' +import os +import sys +import time +import shutil +import statistics +import threading +from pathlib import Path + +os.environ["TUSK_DRIFT_MODE"] = "{mode}" +sys.path.insert(0, "{os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))}") + +import requests + +TRACE_DIR = Path("{Path(__file__).parent.parent / ".benchmark-traces-qps"}") +if TRACE_DIR.exists(): + shutil.rmtree(TRACE_DIR) + +if "{mode}" == "RECORD": + from drift import TuskDrift + from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + + sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + sampling_rate={sampling_rate}, + log_level="warn", + ) + if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + sdk.mark_app_as_ready() + +from benchmarks.server.test_server import TestServer + +# Start server +server = TestServer() +server_info = server.start() +server_url = server_info["url"] +time.sleep(0.5) + +session = requests.Session() + +# Warmup +for _ in range(20): + session.post(f"{{server_url}}/api/realistic", json={{"userId": "warmup", "query": "test"}}) + +# Fixed QPS parameters +target_qps = {target_qps} +duration_seconds = {duration_seconds} +interval = 1.0 / target_qps # Time between requests + +latencies = [] +start_time = time.perf_counter() +request_count = 0 +target_requests = target_qps * duration_seconds + +while request_count < target_requests: + # Calculate when this request should start + expected_start = start_time + (request_count * interval) + now = time.perf_counter() + + # Wait if we're ahead of schedule + if now < expected_start: + time.sleep(expected_start - now) + + # Make request and measure latency + req_start = time.perf_counter_ns() + response = session.post( + f"{{server_url}}/api/realistic", + json={{"userId": f"user-{{request_count}}", "query": "search query", "email": "test@example.com"}}, + ) + response.json() + latencies.append(time.perf_counter_ns() - req_start) + request_count += 1 + +elapsed = time.perf_counter() - start_time +actual_qps = request_count / elapsed + +# Calculate statistics +sorted_latencies = sorted(latencies) +results = {{ + "target_qps": target_qps, + "actual_qps": round(actual_qps, 1), + "total_requests": request_count, + "duration_seconds": round(elapsed, 2), + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": sorted_latencies[len(sorted_latencies) // 2] / 1_000_000, + "p90_ms": sorted_latencies[int(len(sorted_latencies) * 0.90)] / 1_000_000, + "p99_ms": sorted_latencies[int(len(sorted_latencies) * 0.99)] / 1_000_000, + "min_ms": min(latencies) / 1_000_000, + "max_ms": max(latencies) / 1_000_000, +}} + 
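+# NOTE: latencies are measured client-side around session.post() and response.json(),
+# so they include requests/connection overhead on top of server processing time; the
+# percentile figures above use simple nearest-rank indexing into the sorted samples.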
+session.close() +server.stop() + +if "{mode}" == "RECORD": + sdk.shutdown() + +import json +print("RESULTS_JSON:" + json.dumps(results)) +''' + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + timeout=180, + ) + + if result.returncode != 0: + print(f"Error running benchmark ({mode}, {target_qps} QPS):") + print(result.stderr[-1000:]) + return None + + for line in result.stdout.split("\n"): + if line.startswith("RESULTS_JSON:"): + return json.loads(line.replace("RESULTS_JSON:", "")) + + print("Could not find results in output") + return None + + +def main(): + print("=" * 70) + print("Fixed QPS Latency Benchmark") + print("=" * 70) + print() + print("This test sends requests at a fixed rate to measure latency impact.") + print("Endpoint: POST /api/realistic (~10-15ms baseline)") + print() + + # Test at multiple QPS levels + qps_levels = [25, 50, 75] # Requests per second + duration = 10 # seconds per test + + results = {"baseline": {}, "sdk_100": {}, "sdk_10": {}} + + for qps in qps_levels: + print(f"Testing at {qps} QPS...") + + # Baseline + print(" Running baseline (SDK disabled)...") + baseline = run_fixed_qps_benchmark("DISABLED", qps, duration) + if baseline: + results["baseline"][qps] = baseline + print(f" Mean: {baseline['mean_ms']:.1f}ms, p99: {baseline['p99_ms']:.1f}ms") + + # SDK 100% + print(" Running SDK (100% sampling)...") + sdk_100 = run_fixed_qps_benchmark("RECORD", qps, duration, sampling_rate=1.0) + if sdk_100: + results["sdk_100"][qps] = sdk_100 + print(f" Mean: {sdk_100['mean_ms']:.1f}ms, p99: {sdk_100['p99_ms']:.1f}ms") + + # SDK 10% + print(" Running SDK (10% sampling)...") + sdk_10 = run_fixed_qps_benchmark("RECORD", qps, duration, sampling_rate=0.1) + if sdk_10: + results["sdk_10"][qps] = sdk_10 + print(f" Mean: {sdk_10['mean_ms']:.1f}ms, p99: {sdk_10['p99_ms']:.1f}ms") + + print() + + # Print summary table + print("=" * 70) + print("Results Summary: Latency at Fixed QPS") + print("=" * 70) + print() + + print("### Mean Latency (ms)") + print() + print("| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |") + print("|-----|----------|------------|----------|-----------|----------|") + + for qps in qps_levels: + b = results["baseline"].get(qps, {}) + s100 = results["sdk_100"].get(qps, {}) + s10 = results["sdk_10"].get(qps, {}) + + if b and s100 and s10: + b_mean = b["mean_ms"] + s100_mean = s100["mean_ms"] + s10_mean = s10["mean_ms"] + + overhead_100 = s100_mean - b_mean + overhead_10 = s10_mean - b_mean + pct_100 = (overhead_100 / b_mean) * 100 if b_mean > 0 else 0 + pct_10 = (overhead_10 / b_mean) * 100 if b_mean > 0 else 0 + + print( + f"| {qps} | {b_mean:.1f}ms | {s100_mean:.1f}ms | +{overhead_100:.1f}ms ({pct_100:+.0f}%) | {s10_mean:.1f}ms | +{overhead_10:.1f}ms ({pct_10:+.0f}%) |" + ) + + print() + print("### P99 Latency (ms)") + print() + print("| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |") + print("|-----|----------|------------|----------|-----------|----------|") + + for qps in qps_levels: + b = results["baseline"].get(qps, {}) + s100 = results["sdk_100"].get(qps, {}) + s10 = results["sdk_10"].get(qps, {}) + + if b and s100 and s10: + b_p99 = b["p99_ms"] + s100_p99 = s100["p99_ms"] + s10_p99 = s10["p99_ms"] + + overhead_100 = s100_p99 - b_p99 + overhead_10 = s10_p99 - b_p99 + pct_100 = (overhead_100 / b_p99) * 100 if b_p99 > 0 else 0 + pct_10 = (overhead_10 / b_p99) * 100 if b_p99 > 0 else 0 
+ + print( + f"| {qps} | {b_p99:.1f}ms | {s100_p99:.1f}ms | +{overhead_100:.1f}ms ({pct_100:+.0f}%) | {s10_p99:.1f}ms | +{overhead_10:.1f}ms ({pct_10:+.0f}%) |" + ) + + print() + + # Save results + results_dir = Path(__file__).parent.parent / "results" + results_dir.mkdir(exist_ok=True) + + with open(results_dir / "fixed-qps-latency.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"Results saved to {results_dir / 'fixed-qps-latency.json'}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench/realistic_workload.py b/benchmarks/bench/realistic_workload.py new file mode 100644 index 0000000..77b3c0e --- /dev/null +++ b/benchmarks/bench/realistic_workload.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +"""Benchmark: Realistic workload comparison. + +This benchmark tests SDK overhead on realistic API workloads: +1. Typical read endpoint (~5-10ms baseline) +2. Typical write endpoint (~15-25ms baseline) +3. Realistic mixed workload (~10-20ms baseline) + +These simulate production APIs that do actual work (I/O, validation, processing). +""" + +import json +import os +import subprocess +import sys +from pathlib import Path + + +def run_benchmark_subprocess(mode: str, sampling_rate: float = 1.0) -> dict | None: + """Run benchmark in a subprocess to ensure clean SDK state.""" + + script = f''' +import os +import sys +import time +import shutil +import statistics +from pathlib import Path + +# Configure SDK mode +os.environ["TUSK_DRIFT_MODE"] = "{mode}" +sys.path.insert(0, "{os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))}") + +import requests + +TRACE_DIR = Path("{Path(__file__).parent.parent / ".benchmark-traces-realistic"}") +if TRACE_DIR.exists(): + shutil.rmtree(TRACE_DIR) + +if "{mode}" == "RECORD": + from drift import TuskDrift + from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + + sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + sampling_rate={sampling_rate}, + log_level="warn", + ) + if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + sdk.mark_app_as_ready() + +from benchmarks.server.test_server import TestServer + +# Start server +server = TestServer() +server_info = server.start() +server_url = server_info["url"] + +# Wait for server +time.sleep(0.5) +session = requests.Session() + +# Warmup +for _ in range(10): + session.get(f"{{server_url}}/api/typical-read") + session.post(f"{{server_url}}/api/typical-write", json={{"name": "test"}}) + session.post(f"{{server_url}}/api/realistic", json={{"userId": "u1", "query": "test"}}) + +# Benchmark parameters +iterations = 200 +results = {{}} + +# Test 1: Typical Read (~5-10ms baseline) +latencies = [] +for _ in range(iterations): + start = time.perf_counter_ns() + response = session.get(f"{{server_url}}/api/typical-read") + response.json() + latencies.append(time.perf_counter_ns() - start) + +results["typical_read"] = {{ + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": statistics.median(latencies) / 1_000_000, + "p99_ms": sorted(latencies)[int(len(latencies) * 0.99)] / 1_000_000, + "samples": len(latencies), +}} + +# Test 2: Typical Write (~15-25ms baseline) +latencies = [] +for i in range(iterations): + start = time.perf_counter_ns() + response = session.post(f"{{server_url}}/api/typical-write", json={{"name": f"item-{{i}}"}}) + response.json() + latencies.append(time.perf_counter_ns() - start) + 
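+# NOTE: with iterations=200, each p99 here is a nearest-rank percentile (index 198
+# of the sorted samples), i.e. effectively the second-slowest request, so expect
+# some run-to-run noise in p99 comparisons.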
+results["typical_write"] = {{ + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": statistics.median(latencies) / 1_000_000, + "p99_ms": sorted(latencies)[int(len(latencies) * 0.99)] / 1_000_000, + "samples": len(latencies), +}} + +# Test 3: Realistic Mixed (~10-20ms baseline) +latencies = [] +for i in range(iterations): + start = time.perf_counter_ns() + response = session.post( + f"{{server_url}}/api/realistic", + json={{"userId": f"user-{{i}}", "query": "search query", "email": "test@example.com"}}, + ) + response.json() + latencies.append(time.perf_counter_ns() - start) + +results["realistic_mixed"] = {{ + "mean_ms": statistics.mean(latencies) / 1_000_000, + "p50_ms": statistics.median(latencies) / 1_000_000, + "p99_ms": sorted(latencies)[int(len(latencies) * 0.99)] / 1_000_000, + "samples": len(latencies), +}} + +# Cleanup +session.close() +server.stop() + +if "{mode}" == "RECORD": + sdk.shutdown() + +import json +print("RESULTS_JSON:" + json.dumps(results)) +''' + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + timeout=120, + ) + + if result.returncode != 0: + print(f"Error running benchmark ({mode}):") + print(result.stderr) + return None + + # Extract results from output + for line in result.stdout.split("\n"): + if line.startswith("RESULTS_JSON:"): + return json.loads(line.replace("RESULTS_JSON:", "")) + + print(f"Could not find results in output for {mode}") + print(result.stdout[-500:]) + return None + + +def format_comparison(baseline: dict, sdk_on: dict) -> dict: + """Calculate overhead percentages.""" + comparison = {} + for key in baseline: + b_mean = baseline[key]["mean_ms"] + s_mean = sdk_on[key]["mean_ms"] + b_p99 = baseline[key]["p99_ms"] + s_p99 = sdk_on[key]["p99_ms"] + + comparison[key] = { + "baseline_mean_ms": b_mean, + "sdk_mean_ms": s_mean, + "mean_overhead_ms": s_mean - b_mean, + "mean_overhead_pct": ((s_mean - b_mean) / b_mean) * 100 if b_mean > 0 else 0, + "baseline_p99_ms": b_p99, + "sdk_p99_ms": s_p99, + "p99_overhead_ms": s_p99 - b_p99, + "p99_overhead_pct": ((s_p99 - b_p99) / b_p99) * 100 if b_p99 > 0 else 0, + } + return comparison + + +def main(): + print("=" * 60) + print("Realistic Workload Benchmark") + print("=" * 60) + print() + + # Run baseline (SDK disabled) + print("Running baseline (SDK disabled)...") + baseline = run_benchmark_subprocess("DISABLED") + if not baseline: + print("Failed to run baseline benchmark") + return + + print( + f" Typical Read: {baseline['typical_read']['mean_ms']:.2f}ms mean, {baseline['typical_read']['p99_ms']:.2f}ms p99" + ) + print( + f" Typical Write: {baseline['typical_write']['mean_ms']:.2f}ms mean, {baseline['typical_write']['p99_ms']:.2f}ms p99" + ) + print( + f" Realistic Mixed: {baseline['realistic_mixed']['mean_ms']:.2f}ms mean, {baseline['realistic_mixed']['p99_ms']:.2f}ms p99" + ) + print() + + # Run with SDK (100% sampling) + print("Running with SDK (100% sampling)...") + sdk_100 = run_benchmark_subprocess("RECORD", sampling_rate=1.0) + if not sdk_100: + print("Failed to run SDK benchmark") + return + + print( + f" Typical Read: {sdk_100['typical_read']['mean_ms']:.2f}ms mean, {sdk_100['typical_read']['p99_ms']:.2f}ms p99" + ) + print( + f" Typical Write: {sdk_100['typical_write']['mean_ms']:.2f}ms mean, {sdk_100['typical_write']['p99_ms']:.2f}ms p99" + ) + print( + f" Realistic Mixed: {sdk_100['realistic_mixed']['mean_ms']:.2f}ms mean, 
{sdk_100['realistic_mixed']['p99_ms']:.2f}ms p99" + ) + print() + + # Run with SDK (10% sampling) + print("Running with SDK (10% sampling)...") + sdk_10 = run_benchmark_subprocess("RECORD", sampling_rate=0.1) + if not sdk_10: + print("Failed to run SDK 10% benchmark") + return + + print( + f" Typical Read: {sdk_10['typical_read']['mean_ms']:.2f}ms mean, {sdk_10['typical_read']['p99_ms']:.2f}ms p99" + ) + print( + f" Typical Write: {sdk_10['typical_write']['mean_ms']:.2f}ms mean, {sdk_10['typical_write']['p99_ms']:.2f}ms p99" + ) + print( + f" Realistic Mixed: {sdk_10['realistic_mixed']['mean_ms']:.2f}ms mean, {sdk_10['realistic_mixed']['p99_ms']:.2f}ms p99" + ) + print() + + # Calculate and display comparisons + comparison_100 = format_comparison(baseline, sdk_100) + comparison_10 = format_comparison(baseline, sdk_10) + + print("=" * 60) + print("Results Summary") + print("=" * 60) + print() + + print("## SDK Overhead on Realistic Workloads") + print() + print("| Endpoint | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |") + print("|----------|----------|------------|----------|-----------|----------|") + + for key, label in [ + ("typical_read", "Read API (~5ms)"), + ("typical_write", "Write API (~15ms)"), + ("realistic_mixed", "Mixed API (~10ms)"), + ]: + c100 = comparison_100[key] + c10 = comparison_10[key] + print( + f"| {label} | {c100['baseline_mean_ms']:.1f}ms | {c100['sdk_mean_ms']:.1f}ms | +{c100['mean_overhead_ms']:.1f}ms ({c100['mean_overhead_pct']:+.1f}%) | {c10['sdk_mean_ms']:.1f}ms | +{c10['mean_overhead_ms']:.1f}ms ({c10['mean_overhead_pct']:+.1f}%) |" + ) + + print() + print("## P99 Latency Impact") + print() + print("| Endpoint | Baseline p99 | SDK (100%) p99 | SDK (10%) p99 |") + print("|----------|--------------|----------------|---------------|") + + for key, label in [("typical_read", "Read API"), ("typical_write", "Write API"), ("realistic_mixed", "Mixed API")]: + c100 = comparison_100[key] + c10 = comparison_10[key] + print( + f"| {label} | {c100['baseline_p99_ms']:.1f}ms | {c100['sdk_p99_ms']:.1f}ms (+{c100['p99_overhead_pct']:.1f}%) | {c10['sdk_p99_ms']:.1f}ms (+{c10['p99_overhead_pct']:.1f}%) |" + ) + + print() + + # Save results + results_dir = Path(__file__).parent.parent / "results" + results_dir.mkdir(exist_ok=True) + + results = { + "baseline": baseline, + "sdk_100_percent": sdk_100, + "sdk_10_percent": sdk_10, + "comparison_100": comparison_100, + "comparison_10": comparison_10, + } + + with open(results_dir / "realistic-workload.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"Results saved to {results_dir / 'realistic-workload.json'}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench/resource_monitor.py b/benchmarks/bench/resource_monitor.py new file mode 100644 index 0000000..9174e16 --- /dev/null +++ b/benchmarks/bench/resource_monitor.py @@ -0,0 +1,202 @@ +"""Resource monitoring for benchmarks - tracks CPU and memory usage.""" + +from __future__ import annotations + +import resource +import threading +import time +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class CpuStats: + """CPU usage statistics.""" + + user_percent: float = 0.0 + system_percent: float = 0.0 + total_percent: float = 0.0 + + +@dataclass +class MemoryStats: + """Memory usage statistics.""" + + rss_avg: float = 0.0 + rss_max: float = 0.0 + + +@dataclass +class TaskResourceStats: + """Resource statistics for a single task.""" + + cpu: CpuStats = field(default_factory=CpuStats) + memory: MemoryStats = 
field(default_factory=MemoryStats) + + +@dataclass +class TaskStats: + """Internal tracking for a task.""" + + rss_sum: float = 0.0 + rss_max: float = 0.0 + start_user_time: float = 0.0 + start_system_time: float = 0.0 + start_time: float = 0.0 + sample_count: int = 0 + + +class ResourceMonitor: + """Monitor CPU and memory usage during benchmark tasks.""" + + def __init__(self, interval_ms: int = 100, enable_memory_tracking: bool = True): + self.interval_ms = interval_ms + self.enable_memory_tracking = enable_memory_tracking + + self._task_stats: dict[str, TaskStats] = {} + self._completed_stats: dict[str, TaskResourceStats] = {} + self._current_task_name: str | None = None + self._current_task_stats: TaskStats | None = None + self._is_running = False + self._monitor_thread: threading.Thread | None = None + self._stop_event = threading.Event() + + def start(self) -> None: + """Start the resource monitor.""" + self._is_running = True + self._stop_event.clear() + + if self.enable_memory_tracking: + self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) + self._monitor_thread.start() + + def stop(self) -> None: + """Stop the resource monitor.""" + self._is_running = False + self._stop_event.set() + if self._monitor_thread: + self._monitor_thread.join(timeout=1) + self._monitor_thread = None + + def start_task(self, task_name: str) -> None: + """Start monitoring a new task.""" + self._current_task_name = task_name + + # Get current CPU times + usage = resource.getrusage(resource.RUSAGE_SELF) + + task_stats = TaskStats( + start_user_time=usage.ru_utime, + start_system_time=usage.ru_stime, + start_time=time.time(), + ) + self._task_stats[task_name] = task_stats + self._current_task_stats = task_stats + + def end_task(self) -> None: + """End monitoring the current task.""" + if not self._current_task_name or not self._current_task_stats: + return + + end_time = time.time() + total_elapsed = end_time - self._current_task_stats.start_time + + # Get final CPU times + usage = resource.getrusage(resource.RUSAGE_SELF) + user_time = usage.ru_utime - self._current_task_stats.start_user_time + system_time = usage.ru_stime - self._current_task_stats.start_system_time + + # Calculate CPU percentages + if total_elapsed > 0: + user_percent = (user_time / total_elapsed) * 100 + system_percent = (system_time / total_elapsed) * 100 + else: + user_percent = 0.0 + system_percent = 0.0 + + total_percent = user_percent + system_percent + + # Calculate memory stats + if self._current_task_stats.sample_count > 0: + rss_avg = self._current_task_stats.rss_sum / self._current_task_stats.sample_count + rss_max = self._current_task_stats.rss_max + else: + rss_avg = 0.0 + rss_max = 0.0 + + resource_stats = TaskResourceStats( + cpu=CpuStats( + user_percent=user_percent, + system_percent=system_percent, + total_percent=total_percent, + ), + memory=MemoryStats( + rss_avg=rss_avg, + rss_max=rss_max, + ), + ) + + self._completed_stats[self._current_task_name] = resource_stats + self._current_task_name = None + self._current_task_stats = None + + def _monitor_loop(self) -> None: + """Background thread for collecting memory samples.""" + while not self._stop_event.wait(self.interval_ms / 1000): + self._collect_memory_sample() + + def _collect_memory_sample(self) -> None: + """Collect a memory usage sample.""" + if not self._is_running or not self._current_task_stats: + return + + # Get RSS from resource module (in bytes on macOS, kilobytes on Linux) + usage = resource.getrusage(resource.RUSAGE_SELF) + # 
ru_maxrss is in bytes on macOS, kilobytes on Linux + import sys + + if sys.platform == "darwin": + rss = usage.ru_maxrss # Already in bytes + else: + rss = usage.ru_maxrss * 1024 # Convert KB to bytes + + # For more accurate current RSS, try to use /proc on Linux + try: + with open("/proc/self/statm") as f: + # statm gives pages, page size is typically 4096 + pages = int(f.read().split()[1]) + rss = pages * 4096 + except (FileNotFoundError, PermissionError): + pass # Use ru_maxrss fallback + + self._current_task_stats.rss_sum += rss + self._current_task_stats.rss_max = max(self._current_task_stats.rss_max, rss) + self._current_task_stats.sample_count += 1 + + def get_task_stats(self, task_name: str) -> TaskResourceStats | None: + """Get resource statistics for a completed task.""" + return self._completed_stats.get(task_name) + + def get_all_task_names(self) -> list[str]: + """Get names of all completed tasks.""" + return list(self._completed_stats.keys()) + + def to_dict(self, task_name: str) -> dict[str, Any] | None: + """Get task stats as a dictionary.""" + stats = self.get_task_stats(task_name) + if not stats: + return None + + return { + "cpu": { + "userPercent": stats.cpu.user_percent, + "systemPercent": stats.cpu.system_percent, + "totalPercent": stats.cpu.total_percent, + }, + "memory": { + "rss": { + "avg": stats.memory.rss_avg, + "max": stats.memory.rss_max, + }, + }, + } diff --git a/benchmarks/bench/result_utils.py b/benchmarks/bench/result_utils.py new file mode 100644 index 0000000..1e4a2b6 --- /dev/null +++ b/benchmarks/bench/result_utils.py @@ -0,0 +1,337 @@ +"""Result utilities for benchmarks - serialization and comparison.""" + +from __future__ import annotations + +import json +import os +import platform +import statistics +import uuid +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from .resource_monitor import ResourceMonitor + +RESULTS_DIR = Path(__file__).parent.parent / "results" + + +@dataclass +class MetricSummary: + """Summary statistics for a metric.""" + + unit: str # "ns" or "ops/s" + mean: float | None = None + median: float | None = None + min: float | None = None + max: float | None = None + std_dev: float | None = None + p99: float | None = None + samples: int = 0 + + +@dataclass +class HistogramBucket: + """A histogram bucket.""" + + min_ns: float + max_ns: float + count: int + + +@dataclass +class TaskBenchmarkResult: + """Benchmark result for a single task.""" + + name: str + samples: int + latency: MetricSummary + throughput: MetricSummary + resource: dict[str, Any] | None = None + histogram: list[HistogramBucket] | None = None + + +@dataclass +class SystemInfo: + """System information.""" + + python_version: str + platform_name: str + arch: str + cpu_count: int + total_memory: int + load_average: list[float] + + +@dataclass +class BenchmarkRunResult: + """Complete benchmark run result.""" + + id: str + label: str + timestamp: str + duration_ms: float + options: dict[str, Any] + system: SystemInfo + tasks: list[TaskBenchmarkResult] = field(default_factory=list) + + +def get_system_info() -> SystemInfo: + """Collect system information.""" + import psutil # ty:ignore[unresolved-import] + + return SystemInfo( + python_version=platform.python_version(), + platform_name=platform.system().lower(), + arch=platform.machine(), + cpu_count=os.cpu_count() or 1, + total_memory=psutil.virtual_memory().total, + load_average=list(os.getloadavg()) if hasattr(os, "getloadavg") else [0.0, 
0.0, 0.0], + ) + + +def build_latency_summary(samples_ns: list[float]) -> MetricSummary: + """Build latency summary from samples (in nanoseconds).""" + if not samples_ns: + return MetricSummary(unit="ns", samples=0) + + sorted_samples = sorted(samples_ns) + n = len(sorted_samples) + + # Calculate p99 + p99_idx = int(n * 0.99) + p99 = sorted_samples[min(p99_idx, n - 1)] + + return MetricSummary( + unit="ns", + mean=statistics.mean(samples_ns), + median=statistics.median(samples_ns), + min=min(samples_ns), + max=max(samples_ns), + std_dev=statistics.stdev(samples_ns) if len(samples_ns) > 1 else 0.0, + p99=p99, + samples=n, + ) + + +def build_throughput_summary(samples_ns: list[float]) -> MetricSummary: + """Build throughput summary (ops/s) from latency samples (ns).""" + if not samples_ns: + return MetricSummary(unit="ops/s", samples=0) + + # Convert ns latencies to ops/s + ops_per_sec = [1_000_000_000 / ns if ns > 0 else 0 for ns in samples_ns] + + return MetricSummary( + unit="ops/s", + mean=statistics.mean(ops_per_sec), + median=statistics.median(ops_per_sec), + min=min(ops_per_sec), + max=max(ops_per_sec), + std_dev=statistics.stdev(ops_per_sec) if len(ops_per_sec) > 1 else 0.0, + samples=len(ops_per_sec), + ) + + +def build_histogram(samples_ns: list[float], bucket_count: int = 20) -> list[HistogramBucket]: + """Build a histogram from latency samples.""" + if not samples_ns: + return [] + + sorted_samples = sorted(samples_ns) + min_val = sorted_samples[0] + max_val = sorted_samples[-1] + + if min_val == max_val: + return [HistogramBucket(min_ns=min_val, max_ns=max_val, count=len(samples_ns))] + + bucket_size = (max_val - min_val) / bucket_count + counts = [0] * bucket_count + + for sample in samples_ns: + idx = int((sample - min_val) / bucket_size) + idx = min(idx, bucket_count - 1) + counts[idx] += 1 + + buckets = [] + for i, count in enumerate(counts): + bucket_min = min_val + i * bucket_size + bucket_max = max_val if i == bucket_count - 1 else min_val + (i + 1) * bucket_size + buckets.append(HistogramBucket(min_ns=bucket_min, max_ns=bucket_max, count=count)) + + return buckets + + +def create_task_result( + name: str, + samples_ns: list[float], + resource_monitor: ResourceMonitor, +) -> TaskBenchmarkResult: + """Create a task benchmark result from raw samples.""" + return TaskBenchmarkResult( + name=name, + samples=len(samples_ns), + latency=build_latency_summary(samples_ns), + throughput=build_throughput_summary(samples_ns), + resource=resource_monitor.to_dict(name), + histogram=build_histogram(samples_ns), + ) + + +def create_benchmark_result( + label: str, + tasks: list[TaskBenchmarkResult], + duration_ms: float, + options: dict[str, Any], +) -> BenchmarkRunResult: + """Create a complete benchmark run result.""" + return BenchmarkRunResult( + id=str(uuid.uuid4()), + label=label, + timestamp=datetime.now(UTC).isoformat(), + duration_ms=duration_ms, + options=options, + system=get_system_info(), + tasks=tasks, + ) + + +def metric_to_dict(metric: MetricSummary) -> dict[str, Any]: + """Convert MetricSummary to dict.""" + return { + "unit": metric.unit, + "mean": metric.mean, + "median": metric.median, + "min": metric.min, + "max": metric.max, + "standardDeviation": metric.std_dev, + "p99": metric.p99, + "samples": metric.samples, + } + + +def task_to_dict(task: TaskBenchmarkResult) -> dict[str, Any]: + """Convert TaskBenchmarkResult to dict.""" + result: dict[str, Any] = { + "name": task.name, + "samples": task.samples, + "latency": metric_to_dict(task.latency), + "throughput": 
metric_to_dict(task.throughput), + } + if task.resource: + result["resource"] = task.resource + if task.histogram: + result["histogram"] = [{"minNs": b.min_ns, "maxNs": b.max_ns, "count": b.count} for b in task.histogram] + return result + + +def result_to_dict(result: BenchmarkRunResult) -> dict[str, Any]: + """Convert BenchmarkRunResult to dict.""" + return { + "id": result.id, + "label": result.label, + "timestamp": result.timestamp, + "durationMs": result.duration_ms, + "options": result.options, + "system": { + "pythonVersion": result.system.python_version, + "platform": result.system.platform_name, + "arch": result.system.arch, + "cpuCount": result.system.cpu_count, + "totalMemory": result.system.total_memory, + "loadAverage": result.system.load_average, + }, + "tasks": [task_to_dict(task) for task in result.tasks], + } + + +def persist_result(result: BenchmarkRunResult) -> Path: + """Save benchmark result to JSON file.""" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + + sanitized_label = "".join(c if c.isalnum() or c in "-_" else "-" for c in result.label) + filename = f"{sanitized_label}.json" + output_path = RESULTS_DIR / filename + + with open(output_path, "w") as f: + json.dump(result_to_dict(result), f, indent=2) + + return output_path + + +def load_result(label: str) -> BenchmarkRunResult | None: + """Load a benchmark result from JSON file.""" + sanitized_label = "".join(c if c.isalnum() or c in "-_" else "-" for c in label) + filename = f"{sanitized_label}.json" + filepath = RESULTS_DIR / filename + + if not filepath.exists(): + return None + + with open(filepath) as f: + data = json.load(f) + + # Reconstruct the result object + system = SystemInfo( + python_version=data["system"]["pythonVersion"], + platform_name=data["system"]["platform"], + arch=data["system"]["arch"], + cpu_count=data["system"]["cpuCount"], + total_memory=data["system"]["totalMemory"], + load_average=data["system"]["loadAverage"], + ) + + tasks = [] + for task_data in data["tasks"]: + latency = MetricSummary( + unit=task_data["latency"]["unit"], + mean=task_data["latency"].get("mean"), + median=task_data["latency"].get("median"), + min=task_data["latency"].get("min"), + max=task_data["latency"].get("max"), + std_dev=task_data["latency"].get("standardDeviation"), + p99=task_data["latency"].get("p99"), + samples=task_data["latency"].get("samples", 0), + ) + throughput = MetricSummary( + unit=task_data["throughput"]["unit"], + mean=task_data["throughput"].get("mean"), + median=task_data["throughput"].get("median"), + min=task_data["throughput"].get("min"), + max=task_data["throughput"].get("max"), + std_dev=task_data["throughput"].get("standardDeviation"), + samples=task_data["throughput"].get("samples", 0), + ) + + histogram = None + if "histogram" in task_data and task_data["histogram"]: + histogram = [ + HistogramBucket( + min_ns=b["minNs"], + max_ns=b["maxNs"], + count=b["count"], + ) + for b in task_data["histogram"] + ] + + tasks.append( + TaskBenchmarkResult( + name=task_data["name"], + samples=task_data["samples"], + latency=latency, + throughput=throughput, + resource=task_data.get("resource"), + histogram=histogram, + ) + ) + + return BenchmarkRunResult( + id=data["id"], + label=data["label"], + timestamp=data["timestamp"], + duration_ms=data["durationMs"], + options=data["options"], + system=system, + tasks=tasks, + ) diff --git a/benchmarks/bench/sdk_active.py b/benchmarks/bench/sdk_active.py new file mode 100644 index 0000000..59ffb2e --- /dev/null +++ b/benchmarks/bench/sdk_active.py @@ 
-0,0 +1,49 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK Active (RECORD mode).""" + +import os +import shutil +import sys +from pathlib import Path + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +# Clean up benchmark traces directory +BENCHMARK_TRACE_DIR = Path(__file__).parent.parent / ".benchmark-traces" +if BENCHMARK_TRACE_DIR.exists(): + try: + shutil.rmtree(BENCHMARK_TRACE_DIR) + except Exception as e: + print(f"Warning: Failed to clean benchmark trace directory: {e}") + +# Initialize SDK BEFORE importing common (which imports requests) +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +# Initialize the SDK +sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + log_level="warn", # Reduce log noise during benchmarks +) + +# Configure filesystem adapter for traces +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=BENCHMARK_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +# Mark app as ready +sdk.mark_app_as_ready() + +from benchmarks.bench.common import run_benchmarks + +if __name__ == "__main__": + try: + run_benchmarks(label="sdk-active") + finally: + sdk.shutdown() diff --git a/benchmarks/bench/sdk_active_with_transforms.py b/benchmarks/bench/sdk_active_with_transforms.py new file mode 100644 index 0000000..dde9428 --- /dev/null +++ b/benchmarks/bench/sdk_active_with_transforms.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK Active with Transforms (RECORD mode + data transformation).""" + +import os +import shutil +import sys +from pathlib import Path + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +# Clean up benchmark traces directory +BENCHMARK_TRACE_DIR = Path(__file__).parent.parent / ".benchmark-traces-transforms" +if BENCHMARK_TRACE_DIR.exists(): + try: + shutil.rmtree(BENCHMARK_TRACE_DIR) + except Exception as e: + print(f"Warning: Failed to clean benchmark trace directory: {e}") + +# Initialize SDK BEFORE importing common (which imports requests) +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +# Define transforms to mask sensitive data +transforms = { + "request": { + "body": [ + {"path": "$.password", "action": "redact"}, + {"path": "$.ssn", "action": "redact"}, + {"path": "$.creditCard", "action": "redact"}, + {"path": "$.email", "action": "hash"}, + ], + }, + "response": { + "body": [ + {"path": "$.token", "action": "redact"}, + {"path": "$.user.email", "action": "hash"}, + {"path": "$.profile.ssn", "action": "redact"}, + {"path": "$.profile.creditCard", "action": "redact"}, + {"path": "$.profile.email", "action": "hash"}, + ], + }, +} + +# Initialize the SDK with transforms +sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + transforms=transforms, + log_level="warn", # Reduce log noise during benchmarks +) + +# Configure filesystem adapter for traces +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=BENCHMARK_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +# Mark app as ready +sdk.mark_app_as_ready() + +from benchmarks.bench.common import run_benchmarks + +if 
__name__ == "__main__": + try: + run_benchmarks(label="sdk-active-with-transforms") + finally: + sdk.shutdown() diff --git a/benchmarks/bench/sdk_disabled.py b/benchmarks/bench/sdk_disabled.py new file mode 100644 index 0000000..845c2b5 --- /dev/null +++ b/benchmarks/bench/sdk_disabled.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK Disabled (baseline).""" + +import os +import sys + +# Ensure SDK is disabled +os.environ["TUSK_DRIFT_MODE"] = "DISABLED" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from benchmarks.bench.common import run_benchmarks + +if __name__ == "__main__": + run_benchmarks(label="sdk-disabled") diff --git a/benchmarks/bench/sdk_sampling_rates.py b/benchmarks/bench/sdk_sampling_rates.py new file mode 100644 index 0000000..929362c --- /dev/null +++ b/benchmarks/bench/sdk_sampling_rates.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +"""Benchmark: SDK with different sampling rates.""" + +import os +import shutil +import sys +from pathlib import Path + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +# Clean up benchmark traces directory +BENCHMARK_TRACE_DIR = Path(__file__).parent.parent / ".benchmark-traces-sampling" + + +def run_with_sampling_rate(sampling_rate: float) -> dict | None: + """Run benchmark with a specific sampling rate.""" + # Clean traces directory + if BENCHMARK_TRACE_DIR.exists(): + try: + shutil.rmtree(BENCHMARK_TRACE_DIR) + except Exception as e: + print(f"Warning: Failed to clean benchmark trace directory: {e}") + + # We need to reset the SDK for each sampling rate test + # This requires a fresh Python process, so we'll use subprocess + import subprocess + + result = subprocess.run( + [ + sys.executable, + "-c", + f""" +import os +import sys +import shutil +from pathlib import Path + +os.environ["TUSK_DRIFT_MODE"] = "RECORD" +sys.path.insert(0, "{os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))}") + +BENCHMARK_TRACE_DIR = Path("{BENCHMARK_TRACE_DIR}") +if BENCHMARK_TRACE_DIR.exists(): + shutil.rmtree(BENCHMARK_TRACE_DIR) + +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +sdk = TuskDrift.initialize( + api_key="benchmark-test-key", + env="benchmark", + sampling_rate={sampling_rate}, + log_level="warning", +) + +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=BENCHMARK_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +sdk.mark_app_as_ready() + +from benchmarks.bench.common import run_benchmarks +run_benchmarks(label="sdk-sampling-{int(sampling_rate * 100)}", options={{"time_per_task_ms": 5000, "warmup_iterations": 3}}) + +sdk.shutdown() +""", + ], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + ) + + if result.returncode != 0: + print(f"Error running benchmark with sampling_rate={sampling_rate}:") + print(result.stderr) + return None + + # Print stdout (benchmark output) + print(result.stdout) + + return {"sampling_rate": sampling_rate} + + +def main(): + """Run benchmarks with different sampling rates.""" + sampling_rates = [1.0, 0.5, 0.1, 0.01] + + print("=" * 60) + print("Sampling Rate Impact Benchmarks") + print("=" * 60) + + for rate in sampling_rates: + print(f"\n{'=' * 60}") + 
print(f"Testing sampling_rate = {rate} ({int(rate * 100)}%)") + print("=" * 60) + run_with_sampling_rate(rate) + + print("\n" + "=" * 60) + print("All sampling rate benchmarks complete!") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/compare_benchmarks.py b/benchmarks/compare_benchmarks.py new file mode 100644 index 0000000..b2d5489 --- /dev/null +++ b/benchmarks/compare_benchmarks.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +"""Compare benchmark results and generate summary.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from benchmarks.bench.result_utils import ( + BenchmarkRunResult, + TaskBenchmarkResult, + load_result, +) + +# Task base names (without the label suffix) +CPU_HEAVY_TASK_BASE = "High CPU: POST /api/compute-hash" +IO_HEAVY_TASK_BASE = "High IO, Low CPU: POST /api/io-bound" +TRANSFORM_TASK_BASE = "Transform endpoints" + + +def find_task_by_base_name(tasks: list[TaskBenchmarkResult], base_name: str) -> TaskBenchmarkResult | None: + """Find a task by its base name (ignoring the label suffix).""" + for task in tasks: + if task.name.startswith(base_name + " ("): + return task + return None + + +def percentage_change(baseline: float | None, variant: float | None) -> float | None: + """Calculate percentage change from baseline to variant.""" + if baseline is None or variant is None or baseline == 0: + return None + return ((variant - baseline) / baseline) * 100 + + +def format_percentage(value: float | None) -> str: + """Format a percentage value.""" + if value is None: + return "n/a" + sign = "+" if value >= 0 else "" + return f"{sign}{value:.1f}%" + + +def format_latency(ns: float | None) -> str: + """Format latency in appropriate units.""" + if ns is None: + return "n/a" + if ns < 1_000: + return f"{ns:.0f} ns" + if ns < 1_000_000: + return f"{ns / 1_000:.2f} μs" + if ns < 1_000_000_000: + return f"{ns / 1_000_000:.2f} ms" + return f"{ns / 1_000_000_000:.2f} s" + + +def format_throughput(value: float | None) -> str: + """Format throughput in ops/s.""" + if value is None: + return "n/a" + return f"{value:,.0f}" + + +def format_bytes(bytes_val: float | None) -> str: + """Format bytes in MB.""" + if bytes_val is None: + return "n/a" + mb = bytes_val / (1024 * 1024) + return f"{mb:.2f} MB" + + +def format_cpu_percent(value: float | None) -> str: + """Format CPU percentage.""" + if value is None: + return "n/a" + return f"{value:.1f}%" + + +def compute_impact( + label: str, + baseline: TaskBenchmarkResult | None, + variant: TaskBenchmarkResult | None, +) -> dict | None: + """Compute impact metrics between baseline and variant.""" + if not baseline or not variant: + return None + + baseline_throughput = baseline.throughput.mean + variant_throughput = variant.throughput.mean + baseline_p99 = baseline.latency.p99 + variant_p99 = variant.latency.p99 + + # CPU stats + baseline_cpu_user = baseline.resource.get("cpu", {}).get("userPercent") if baseline.resource else None + variant_cpu_user = variant.resource.get("cpu", {}).get("userPercent") if variant.resource else None + baseline_cpu_total = baseline.resource.get("cpu", {}).get("totalPercent") if baseline.resource else None + variant_cpu_total = variant.resource.get("cpu", {}).get("totalPercent") if variant.resource else None + + return { + "label": label, + "throughput_delta_pct": percentage_change(baseline_throughput, variant_throughput), + "tail_latency_delta_pct": percentage_change(baseline_p99, 
variant_p99), + "baseline_throughput": baseline_throughput, + "variant_throughput": variant_throughput, + "baseline_p99": baseline_p99, + "variant_p99": variant_p99, + "cpu_user_delta_pct": percentage_change(baseline_cpu_user, variant_cpu_user), + "baseline_cpu_total": baseline_cpu_total, + "variant_cpu_total": variant_cpu_total, + } + + +def compute_memory_overhead( + baseline: BenchmarkRunResult, + variant: BenchmarkRunResult, +) -> dict: + """Compute memory overhead between baseline and variant.""" + rss_deltas = [] + rss_max_delta = None + + for baseline_task in baseline.tasks: + # Find matching task in variant + base_name = baseline_task.name.rsplit(" (", 1)[0] + variant_task = find_task_by_base_name(variant.tasks, base_name) + + if not variant_task: + continue + + baseline_mem = baseline_task.resource.get("memory", {}) if baseline_task.resource else {} + variant_mem = variant_task.resource.get("memory", {}) if variant_task.resource else {} + + baseline_rss = baseline_mem.get("rss", {}) + variant_rss = variant_mem.get("rss", {}) + + if baseline_rss.get("avg") is not None and variant_rss.get("avg") is not None: + delta = variant_rss["avg"] - baseline_rss["avg"] + rss_deltas.append(delta) + + if baseline_rss.get("max") is not None and variant_rss.get("max") is not None: + delta = variant_rss["max"] - baseline_rss["max"] + if rss_max_delta is None or delta > rss_max_delta: + rss_max_delta = delta + + return { + "avg_rss_delta": sum(rss_deltas) / len(rss_deltas) if rss_deltas else None, + "max_rss_delta": rss_max_delta, + "samples": len(rss_deltas), + } + + +def main(): + """Main comparison function.""" + print("\nLoading benchmark results...") + + baseline = load_result("sdk-disabled") + active = load_result("sdk-active") + transforms = load_result("sdk-active-with-transforms") + + if not baseline: + print("ERROR: sdk-disabled results not found. Run benchmarks first.") + return + if not active: + print("ERROR: sdk-active results not found. 
Run benchmarks first.") + return + + print(f"\nBaseline: {baseline.label} ({baseline.timestamp})") + print(f"Active: {active.label} ({active.timestamp})") + if transforms: + print(f"Transforms: {transforms.label} ({transforms.timestamp})") + + # Compute impacts + cpu_impact = compute_impact( + CPU_HEAVY_TASK_BASE, + find_task_by_base_name(baseline.tasks, CPU_HEAVY_TASK_BASE), + find_task_by_base_name(active.tasks, CPU_HEAVY_TASK_BASE), + ) + + io_impact = compute_impact( + IO_HEAVY_TASK_BASE, + find_task_by_base_name(baseline.tasks, IO_HEAVY_TASK_BASE), + find_task_by_base_name(active.tasks, IO_HEAVY_TASK_BASE), + ) + + transform_impact = None + if transforms: + transform_impact = compute_impact( + TRANSFORM_TASK_BASE, + find_task_by_base_name(active.tasks, TRANSFORM_TASK_BASE), + find_task_by_base_name(transforms.tasks, TRANSFORM_TASK_BASE), + ) + + # Print performance impact table + print("\n## Benchmark Impact Summary\n") + print("| Workload | Throughput Δ | Tail Latency (p99) Δ | CPU User Δ |") + print("|----------|-------------|----------------------|------------|") + + if cpu_impact: + print( + f"| **CPU-bound** | {format_percentage(cpu_impact['throughput_delta_pct'])} | " + f"{format_percentage(cpu_impact['tail_latency_delta_pct'])} | " + f"{format_percentage(cpu_impact['cpu_user_delta_pct'])} |" + ) + else: + print("| **CPU-bound** | N/A | N/A | N/A |") + + if io_impact: + print( + f"| **IO-bound** | {format_percentage(io_impact['throughput_delta_pct'])} | " + f"{format_percentage(io_impact['tail_latency_delta_pct'])} | " + f"{format_percentage(io_impact['cpu_user_delta_pct'])} |" + ) + else: + print("| **IO-bound** | N/A | N/A | N/A |") + + if transform_impact: + print( + f"| **Transform endpoints** | {format_percentage(transform_impact['throughput_delta_pct'])} | " + f"{format_percentage(transform_impact['tail_latency_delta_pct'])} | " + f"{format_percentage(transform_impact['cpu_user_delta_pct'])} |" + ) + else: + print("| **Transform endpoints** | N/A | N/A | N/A |") + + # Print absolute values table + print("\n## Absolute Values\n") + print("| Workload | Baseline Throughput | Active Throughput | Baseline p99 | Active p99 |") + print("|----------|--------------------|--------------------|--------------|------------|") + + if cpu_impact: + print( + f"| **CPU-bound** | {format_throughput(cpu_impact['baseline_throughput'])} ops/s | " + f"{format_throughput(cpu_impact['variant_throughput'])} ops/s | " + f"{format_latency(cpu_impact['baseline_p99'])} | " + f"{format_latency(cpu_impact['variant_p99'])} |" + ) + + if io_impact: + print( + f"| **IO-bound** | {format_throughput(io_impact['baseline_throughput'])} ops/s | " + f"{format_throughput(io_impact['variant_throughput'])} ops/s | " + f"{format_latency(io_impact['baseline_p99'])} | " + f"{format_latency(io_impact['variant_p99'])} |" + ) + + # Memory overhead + print("\n## Memory Overhead vs Baseline\n") + print("| Configuration | Avg RSS Δ | Max RSS Δ |") + print("|---------------|-----------|-----------|") + + active_memory = compute_memory_overhead(baseline, active) + if active_memory["samples"] > 0: + print( + f"| **SDK Active** | {format_bytes(active_memory['avg_rss_delta'])} | " + f"{format_bytes(active_memory['max_rss_delta'])} |" + ) + else: + print("| **SDK Active** | N/A | N/A |") + + if transforms: + transform_memory = compute_memory_overhead(baseline, transforms) + if transform_memory["samples"] > 0: + print( + f"| **SDK Active w/ Transforms** | {format_bytes(transform_memory['avg_rss_delta'])} | " + 
f"{format_bytes(transform_memory['max_rss_delta'])} |" + ) + else: + print("| **SDK Active w/ Transforms** | N/A | N/A |") + + print("\n✓ Summary complete.\n") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/profile/.gitignore b/benchmarks/profile/.gitignore new file mode 100644 index 0000000..fcb8cf5 --- /dev/null +++ b/benchmarks/profile/.gitignore @@ -0,0 +1,2 @@ +# Profile results +results/ diff --git a/benchmarks/profile/profile.sh b/benchmarks/profile/profile.sh new file mode 100755 index 0000000..6583ac9 --- /dev/null +++ b/benchmarks/profile/profile.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Profile the SDK using different profilers + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")" +RESULTS_DIR="$SCRIPT_DIR/results" + +cd "$PROJECT_ROOT" + +mkdir -p "$RESULTS_DIR" + +PROFILER="${1:-cprofile}" # Default to cProfile + +case "$PROFILER" in + "pyspy"|"py-spy"|"flamegraph") + echo "==============================================" + echo "Running py-spy flame graph profiler" + echo "==============================================" + + if ! command -v py-spy &> /dev/null; then + echo "py-spy not found. Install with: pip install py-spy" + echo "Or on macOS: brew install py-spy" + exit 1 + fi + + OUTPUT="$RESULTS_DIR/flamegraph_$(date +%Y%m%d_%H%M%S).svg" + echo "Output: $OUTPUT" + echo "" + + # py-spy needs sudo on macOS for sampling + if [[ "$OSTYPE" == "darwin"* ]]; then + echo "Note: py-spy may require sudo on macOS" + sudo py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + else + py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + fi + + echo "" + echo "Flame graph saved to: $OUTPUT" + echo "Open in browser to view." + ;; + + "cprofile"|"cProfile") + echo "==============================================" + echo "Running cProfile deterministic profiler" + echo "==============================================" + + OUTPUT="$RESULTS_DIR/cprofile_$(date +%Y%m%d_%H%M%S).prof" + echo "Output: $OUTPUT" + echo "" + + python -m cProfile -o "$OUTPUT" benchmarks/profile/simple_profile.py + + echo "" + echo "Profile saved to: $OUTPUT" + echo "" + echo "Analyzing top functions by cumulative time..." + echo "" + + python -c " +import pstats +p = pstats.Stats('$OUTPUT') +p.strip_dirs() +p.sort_stats('cumulative') +print('=== Top 30 by cumulative time ===') +p.print_stats(30) +print() +print('=== Top 30 by total time (self) ===') +p.sort_stats('tottime') +p.print_stats(30) +" + ;; + + "scalene") + echo "==============================================" + echo "Running Scalene CPU/memory profiler" + echo "==============================================" + + if ! command -v scalene &> /dev/null; then + echo "scalene not found. Install with: pip install scalene" + exit 1 + fi + + OUTPUT="$RESULTS_DIR/scalene_$(date +%Y%m%d_%H%M%S).html" + echo "Output: $OUTPUT" + echo "" + + scalene --html --outfile "$OUTPUT" benchmarks/profile/simple_profile.py + + echo "" + echo "Scalene report saved to: $OUTPUT" + ;; + + "viztracer") + echo "==============================================" + echo "Running VizTracer timeline profiler" + echo "==============================================" + + if ! python -c "import viztracer" 2>/dev/null; then + echo "viztracer not found. 
Install with: pip install viztracer" + exit 1 + fi + + OUTPUT="$RESULTS_DIR/viztracer_$(date +%Y%m%d_%H%M%S).json" + echo "Output: $OUTPUT" + echo "" + + python -m viztracer -o "$OUTPUT" benchmarks/profile/simple_profile.py + + echo "" + echo "VizTracer trace saved to: $OUTPUT" + echo "View with: vizviewer $OUTPUT" + ;; + + *) + echo "Usage: $0 [profiler]" + echo "" + echo "Available profilers:" + echo " cprofile - Built-in deterministic profiler (default)" + echo " pyspy - Sampling profiler with flame graphs" + echo " scalene - CPU/memory profiler with line-level detail" + echo " viztracer - Timeline/trace profiler" + echo "" + echo "Examples:" + echo " $0 cprofile # Run cProfile" + echo " $0 pyspy # Generate flame graph" + echo " $0 scalene # CPU/memory analysis" + exit 1 + ;; +esac + +echo "" +echo "Done!" diff --git a/benchmarks/profile/simple_profile.py b/benchmarks/profile/simple_profile.py new file mode 100644 index 0000000..22bfb1e --- /dev/null +++ b/benchmarks/profile/simple_profile.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Simple profiling script for the SDK. + +This script can be used with various profilers: + +1. py-spy (flame graphs): + py-spy record -o profile.svg -- python benchmarks/profile/simple_profile.py + +2. cProfile (call stats): + python -m cProfile -o profile.prof benchmarks/profile/simple_profile.py + # Then view with: python -m pstats profile.prof + +3. scalene (CPU/memory): + scalene benchmarks/profile/simple_profile.py +""" + +import os +import sys +import time + +# Set SDK to RECORD mode +os.environ["TUSK_DRIFT_MODE"] = "RECORD" + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from pathlib import Path + +import requests + +from drift import TuskDrift +from drift.core.tracing.adapters.filesystem import FilesystemSpanAdapter + +# Setup traces directory +PROFILE_TRACE_DIR = Path(__file__).parent / "results" / "traces" +PROFILE_TRACE_DIR.mkdir(parents=True, exist_ok=True) + +# Initialize SDK +sdk = TuskDrift.initialize( + api_key="profile-test-key", + env="profile", + log_level="warn", +) + +if sdk.span_exporter: + sdk.span_exporter.clear_adapters() + adapter = FilesystemSpanAdapter(base_directory=PROFILE_TRACE_DIR) + sdk.span_exporter.add_adapter(adapter) + +sdk.mark_app_as_ready() + +# Import server after SDK init +from benchmarks.server.test_server import TestServer + + +def main(): + """Run profiling workload.""" + # Start test server + server = TestServer() + server_info = server.start() + server_url = server_info["url"] + + # Wait for server + time.sleep(0.5) + + # Use session for connection pooling + session = requests.Session() + + print(f"\nTest server started at {server_url}") + print("Running profiling workload...") + + # Number of iterations (adjust for longer/shorter profiling) + iterations = 500 + + start_time = time.time() + + for i in range(iterations): + # Mix of different request types + if i % 3 == 0: + # Simple GET + response = session.get(f"{server_url}/api/simple") + response.json() + elif i % 3 == 1: + # POST with JSON body + response = session.post( + f"{server_url}/api/echo", + json={"data": "test", "iteration": i}, + ) + response.json() + else: + # Sensitive data endpoint (triggers transform checks) + response = session.post( + f"{server_url}/api/auth/login", + json={"email": "test@example.com", "password": "secret123"}, + ) + response.json() + + if (i + 1) % 100 == 0: + elapsed = time.time() - start_time + rate = (i + 1) / elapsed + print(f" {i + 
1}/{iterations} iterations ({rate:.1f} req/s)") + + elapsed = time.time() - start_time + print(f"\nCompleted {iterations} iterations in {elapsed:.2f}s") + print(f"Average: {iterations / elapsed:.1f} req/s") + + # Cleanup + session.close() + server.stop() + sdk.shutdown() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run_benchmarks.sh b/benchmarks/run_benchmarks.sh new file mode 100755 index 0000000..6367d74 --- /dev/null +++ b/benchmarks/run_benchmarks.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Run all benchmarks and compare results + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +cd "$PROJECT_ROOT" + +echo "==============================================" +echo "Drift Python SDK Benchmarks" +echo "==============================================" +echo "" + +# Check for psutil +if ! python -c "import psutil" 2>/dev/null; then + echo "Installing psutil..." + uv pip install psutil +fi + +# Check for flask +if ! python -c "import flask" 2>/dev/null; then + echo "Installing flask..." + uv pip install flask +fi + +echo "" +echo "Running SDK Disabled (baseline)..." +echo "----------------------------------------------" +python benchmarks/bench/sdk_disabled.py + +echo "" +echo "Running SDK Active..." +echo "----------------------------------------------" +python benchmarks/bench/sdk_active.py + +echo "" +echo "Running SDK Active with Transforms..." +echo "----------------------------------------------" +python benchmarks/bench/sdk_active_with_transforms.py + +echo "" +echo "==============================================" +echo "Comparing Results" +echo "==============================================" +python benchmarks/compare_benchmarks.py diff --git a/benchmarks/server/__init__.py b/benchmarks/server/__init__.py new file mode 100644 index 0000000..087929f --- /dev/null +++ b/benchmarks/server/__init__.py @@ -0,0 +1 @@ +# Test server for benchmarks diff --git a/benchmarks/server/test_server.py b/benchmarks/server/test_server.py new file mode 100644 index 0000000..5cdddb3 --- /dev/null +++ b/benchmarks/server/test_server.py @@ -0,0 +1,420 @@ +"""Test server for benchmarks - provides various endpoint types for measuring SDK overhead.""" + +from __future__ import annotations + +import hashlib +import json +import threading +from typing import Any + +from flask import Flask, Response, request + + +class TestServer: + """Flask-based test server for benchmarks.""" + + def __init__(self, host: str = "127.0.0.1", port: int = 0): + self.host = host + self.port = port + self.app = Flask(__name__) + self._server_thread: threading.Thread | None = None + self._actual_port: int | None = None + self._setup_routes() + + def _setup_routes(self) -> None: + """Set up all benchmark endpoints.""" + + @self.app.route("/health") + def health() -> Response: + return Response( + json.dumps({"status": "ok"}), + mimetype="application/json", + ) + + @self.app.route("/api/simple") + def simple() -> Response: + """Minimal processing endpoint.""" + import time + + return Response( + json.dumps({"message": "Hello World", "timestamp": int(time.time() * 1000)}), + mimetype="application/json", + ) + + @self.app.route("/api/simple-post", methods=["POST"]) + def simple_post() -> Response: + """Minimal POST endpoint.""" + import time + + return Response( + json.dumps({"message": "Hello World", "timestamp": int(time.time() * 1000)}), + mimetype="application/json", + ) + + @self.app.route("/api/echo", methods=["POST"]) + def echo() -> Response: + """Echo back 
request body.""" + data = request.get_json(force=True, silent=True) or {} + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/small") + def small() -> Response: + """Return ~100KB payload.""" + import time + + data = { + "id": "small-123", + "data": "x" * (100 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/small-post", methods=["POST"]) + def small_post() -> Response: + """Accept and return ~100KB payload.""" + import time + + data = { + "id": "small-post-123", + "data": "x" * (100 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/medium") + def medium() -> Response: + """Return ~1MB payload.""" + import time + + data = { + "id": "medium-456", + "data": "x" * (1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/medium-post", methods=["POST"]) + def medium_post() -> Response: + """Accept and return ~1MB payload.""" + import time + + data = { + "id": "medium-post-456", + "data": "x" * (1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/large") + def large() -> Response: + """Return ~2MB payload.""" + import time + + data = { + "id": "large-789", + "data": "x" * (2 * 1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/large-post", methods=["POST"]) + def large_post() -> Response: + """Accept and return ~2MB payload.""" + import time + + data = { + "id": "large-post-789", + "data": "x" * (2 * 1024 * 1024), + "timestamp": int(time.time() * 1000), + } + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/compute-hash", methods=["POST"]) + def compute_hash() -> Response: + """CPU-intensive endpoint - iterative hashing.""" + data = request.get_json(force=True, silent=True) or {} + input_data = data.get("data", "default-data") + iterations = data.get("iterations", 1000) + + hash_val = input_data + for _ in range(iterations): + hash_val = hashlib.sha256(hash_val.encode()).hexdigest() + + return Response( + json.dumps({"hash": hash_val, "iterations": iterations}), + mimetype="application/json", + ) + + @self.app.route("/api/compute-json", methods=["POST"]) + def compute_json() -> Response: + """CPU-intensive endpoint - JSON parsing/stringifying.""" + req_data = request.get_json(force=True, silent=True) or {} + iterations = req_data.get("iterations", 100) + data = req_data.get("data", {"test": "data", "nested": {"value": 123}}) + + result = data + for i in range(iterations): + str_data = json.dumps(result) + result = json.loads(str_data) + result["iteration"] = i + + return Response(json.dumps(result), mimetype="application/json") + + @self.app.route("/api/auth/login", methods=["POST"]) + def auth_login() -> Response: + """Endpoint with sensitive data for transform testing.""" + data = request.get_json(force=True, silent=True) or {} + return Response( + json.dumps( + { + "success": True, + "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", + "user": { + "id": 123, + "email": data.get("email", "user@example.com"), + "role": "user", + }, + } + ), + mimetype="application/json", + ) + + @self.app.route("/api/users", methods=["POST"]) + def create_user() -> 
Response: + """Endpoint with nested sensitive data.""" + import time + + data = request.get_json(force=True, silent=True) or {} + return Response( + json.dumps( + { + "id": int(time.time() * 1000), + "username": data.get("username", "testuser"), + "profile": { + "email": data.get("email", "test@example.com"), + "ssn": data.get("ssn", "123-45-6789"), + "creditCard": data.get("creditCard", "4111-1111-1111-1111"), + }, + "createdAt": "2025-01-14T00:00:00Z", + } + ), + mimetype="application/json", + ) + + @self.app.route("/api/io-bound", methods=["POST"]) + def io_bound() -> Response: + """High IO, low CPU endpoint - simulates IO-bound work.""" + import time + + data = request.get_json(force=True, silent=True) or {} + jobs = data.get("jobs", 5) + delay_ms = data.get("delayMs", 5) + + results = [] + for i in range(jobs): + time.sleep(delay_ms / 1000) + results.append( + { + "jobId": i, + "timestamp": int(time.time() * 1000), + "status": "completed", + } + ) + + return Response( + json.dumps( + { + "totalJobs": jobs, + "results": results, + "completedAt": int(time.time() * 1000), + } + ), + mimetype="application/json", + ) + + @self.app.route("/api/slow") + def slow() -> Response: + """Slow endpoint - simulates slow processing.""" + import time + + time.sleep(0.1) + return Response( + json.dumps({"message": "Slow response", "timestamp": int(time.time() * 1000)}), + mimetype="application/json", + ) + + @self.app.route("/api/error") + def error() -> Response: + """Error endpoint.""" + return Response( + json.dumps({"error": "Internal Server Error"}), + status=500, + mimetype="application/json", + ) + + @self.app.route("/api/realistic", methods=["POST"]) + def realistic() -> Response: + """Realistic API endpoint - simulates typical production workload. + + This endpoint simulates a real API that: + 1. Validates input (light CPU) + 2. Queries a database (I/O wait - simulated with sleep) + 3. Processes results (moderate CPU) + 4. Returns response + + Target: ~10-20ms response time (typical for production APIs) + """ + import re + import time + + data = request.get_json(force=True, silent=True) or {} + + # 1. Input validation (light CPU work) + user_id = data.get("userId", "user-123") + query = data.get("query", "default query") + + # Validate email format if provided + email = data.get("email", "") + email_valid = bool(re.match(r"[^@]+@[^@]+\.[^@]+", email)) if email else True + + # 2. Simulate database query (I/O wait ~10ms) + time.sleep(0.010) + + # 3. Process results (moderate CPU - JSON manipulation) + results = [] + for i in range(10): + item = { + "id": f"item-{i}", + "userId": user_id, + "data": {"index": i, "query": query[:50]}, + "score": (i + 1) * 0.1, + } + # Serialize and deserialize (simulates data transformation) + item_str = json.dumps(item) + results.append(json.loads(item_str)) + + # 4. Build response + response_data = { + "success": True, + "userId": user_id, + "emailValid": email_valid, + "resultCount": len(results), + "results": results, + "processingTimeMs": 10, # Approximate + "timestamp": int(time.time() * 1000), + } + + return Response(json.dumps(response_data), mimetype="application/json") + + @self.app.route("/api/typical-read", methods=["GET"]) + def typical_read() -> Response: + """Typical read endpoint - simulates fetching data. 
+ + Simulates: Auth check + DB read + response serialization + Target: ~5-10ms response time + """ + import time + + # Simulate auth token validation (light CPU) + auth_header = request.headers.get("Authorization", "") + is_authenticated = len(auth_header) > 10 + + # Simulate database read (~5ms) + time.sleep(0.005) + + # Build response + data = { + "id": "resource-123", + "name": "Example Resource", + "description": "This is a typical API response", + "metadata": { + "createdAt": "2025-01-14T00:00:00Z", + "updatedAt": "2025-01-14T12:00:00Z", + "version": 1, + }, + "authenticated": is_authenticated, + "timestamp": int(time.time() * 1000), + } + + return Response(json.dumps(data), mimetype="application/json") + + @self.app.route("/api/typical-write", methods=["POST"]) + def typical_write() -> Response: + """Typical write endpoint - simulates creating/updating data. + + Simulates: Validation + DB write + response + Target: ~15-25ms response time + """ + import time + + data = request.get_json(force=True, silent=True) or {} + + # Input validation (light CPU) + name = data.get("name", "") + if len(name) < 1: + name = "default" + + # Simulate database write (~15ms - writes are slower than reads) + time.sleep(0.015) + + # Build response + response_data = { + "id": f"new-{int(time.time() * 1000)}", + "name": name, + "status": "created", + "timestamp": int(time.time() * 1000), + } + + return Response(json.dumps(response_data), mimetype="application/json") + + def start(self) -> dict[str, Any]: + """Start the test server in a background thread.""" + import socket + + from werkzeug.serving import make_server + + # Find a free port if port is 0 + if self.port == 0: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((self.host, 0)) + self._actual_port = s.getsockname()[1] + else: + self._actual_port = self.port + + self._server = make_server(self.host, self._actual_port, self.app, threaded=True) + + def run_server(): + self._server.serve_forever() + + self._server_thread = threading.Thread(target=run_server, daemon=True) + self._server_thread.start() + + url = f"http://{self.host}:{self._actual_port}" + print(f"Test server started at {url}") + + return { + "port": self._actual_port, + "host": self.host, + "url": url, + } + + def stop(self) -> None: + """Stop the test server.""" + if hasattr(self, "_server"): + self._server.shutdown() + if self._server_thread: + self._server_thread.join(timeout=5) + print("Test server stopped") + + def get_url(self) -> str: + """Get the server URL.""" + return f"http://{self.host}:{self._actual_port}" + + +if __name__ == "__main__": + # Quick test + server = TestServer() + info = server.start() + print(f"Server running at {info['url']}") + input("Press Enter to stop...") + server.stop() From bcd4a067c1be343f84ae0c8caf6e6dc9818c7a34 Mon Sep 17 00:00:00 2001 From: JY Tan Date: Mon, 19 Jan 2026 19:32:27 -0800 Subject: [PATCH 2/4] Some changes --- .github/workflows/benchmarks.yml | 276 +++++++++++++++++++++++++++ benchmarks/README.md | 33 ++++ benchmarks/bench/common.py | 7 +- benchmarks/bench/resource_monitor.py | 14 +- benchmarks/bench/result_utils.py | 4 +- benchmarks/profile/profile.sh | 8 +- benchmarks/profile/simple_profile.py | 85 +++++---- benchmarks/run_benchmarks.sh | 6 + benchmarks/scripts/compare_runs.py | 177 +++++++++++++++++ benchmarks/server/test_server.py | 16 +- 10 files changed, 564 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/benchmarks.yml create mode 100755 benchmarks/scripts/compare_runs.py diff --git 
a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 0000000..7ff1749 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,276 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + iterations: + description: "Number of iterations for realistic workload benchmark" + required: false + default: "200" + qps_duration: + description: "Duration in seconds for each QPS level" + required: false + default: "10" + compare_with: + description: "Run ID to compare results against (optional)" + required: false + default: "" + +jobs: + benchmark: + name: Run Benchmarks + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Setup Python + run: uv python install 3.9 + + - name: Cache uv + Python installs + venv + uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + ~/.local/share/uv/python + .venv + key: ${{ runner.os }}-uv-benchmark-3.9-${{ hashFiles('uv.lock') }} + + - name: Install dependencies + run: | + uv sync --all-extras + uv pip install flask requests psutil + + - name: Get system info + id: sysinfo + run: | + echo "python_version=$(python --version)" >> $GITHUB_OUTPUT + echo "os=$(uname -s)" >> $GITHUB_OUTPUT + echo "arch=$(uname -m)" >> $GITHUB_OUTPUT + echo "cpu_count=$(nproc)" >> $GITHUB_OUTPUT + echo "memory_gb=$(free -g | awk '/^Mem:/{print $2}')" >> $GITHUB_OUTPUT + + - name: Run realistic workload benchmark + id: realistic + run: | + uv run python benchmarks/bench/realistic_workload.py 2>&1 | tee realistic_output.txt + # Extract just the results JSON + cat benchmarks/results/realistic-workload.json + + - name: Run fixed QPS latency benchmark + id: fixed_qps + run: | + uv run python benchmarks/bench/fixed_qps_latency.py 2>&1 | tee fixed_qps_output.txt + # Extract just the results JSON + cat benchmarks/results/fixed-qps-latency.json + + - name: Generate structured results + id: results + run: | + cat > benchmarks/results/benchmark-summary.json << 'EOF' + { + "metadata": { + "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", + "run_id": "${{ github.run_id }}", + "run_number": "${{ github.run_number }}", + "commit_sha": "${{ github.sha }}", + "branch": "${{ github.ref_name }}", + "triggered_by": "${{ github.actor }}", + "environment": { + "python_version": "${{ steps.sysinfo.outputs.python_version }}", + "os": "${{ steps.sysinfo.outputs.os }}", + "arch": "${{ steps.sysinfo.outputs.arch }}", + "cpu_count": "${{ steps.sysinfo.outputs.cpu_count }}", + "memory_gb": "${{ steps.sysinfo.outputs.memory_gb }}" + } + } + } + EOF + + # Create a proper JSON with jq + jq -n \ + --slurpfile realistic benchmarks/results/realistic-workload.json \ + --slurpfile fixed_qps benchmarks/results/fixed-qps-latency.json \ + --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg run_id "${{ github.run_id }}" \ + --arg run_number "${{ github.run_number }}" \ + --arg commit_sha "${{ github.sha }}" \ + --arg branch "${{ github.ref_name }}" \ + --arg triggered_by "${{ github.actor }}" \ + --arg python_version "${{ steps.sysinfo.outputs.python_version }}" \ + --arg os "${{ steps.sysinfo.outputs.os }}" \ + --arg arch "${{ steps.sysinfo.outputs.arch }}" \ + --arg cpu_count "${{ steps.sysinfo.outputs.cpu_count }}" \ + --arg memory_gb "${{ steps.sysinfo.outputs.memory_gb }}" \ + '{ + metadata: { + timestamp: $timestamp, + run_id: $run_id, + run_number: ($run_number | tonumber), + commit_sha: $commit_sha, + branch: $branch, 
+ triggered_by: $triggered_by, + environment: { + python_version: $python_version, + os: $os, + arch: $arch, + cpu_count: ($cpu_count | tonumber), + memory_gb: ($memory_gb | tonumber) + } + }, + realistic_workload: $realistic[0], + fixed_qps_latency: $fixed_qps[0] + }' > benchmarks/results/benchmark-summary.json + + - name: Generate markdown summary + run: | + SUMMARY_FILE="benchmarks/results/benchmark-summary.md" + + cat > "$SUMMARY_FILE" << EOF + # Benchmark Results + + **Date**: $(date -u +%Y-%m-%d) + **Commit**: ${{ github.sha }} + **Branch**: ${{ github.ref_name }} + **Run ID**: ${{ github.run_id }} + + ## Environment + - Python: ${{ steps.sysinfo.outputs.python_version }} + - OS: ${{ steps.sysinfo.outputs.os }} (${{ steps.sysinfo.outputs.arch }}) + - CPUs: ${{ steps.sysinfo.outputs.cpu_count }} + - Memory: ${{ steps.sysinfo.outputs.memory_gb }} GB + + ## Realistic Workload Results + + EOF + + # Parse and format realistic workload results + jq -r ' + "| Endpoint | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |", + "|----------|----------|------------|----------|-----------|----------|", + (.comparison_100 | to_entries[] | + "| \(.key) | \(.value.baseline_mean_ms | . * 10 | round / 10)ms | \(.value.sdk_mean_ms | . * 10 | round / 10)ms | +\(.value.mean_overhead_ms | . * 10 | round / 10)ms (\(.value.mean_overhead_pct | round)%) | - | - |" + ) + ' benchmarks/results/realistic-workload.json >> "$SUMMARY_FILE" + + cat >> "$SUMMARY_FILE" << 'EOF' + + ## Fixed QPS Latency Results + + ### Mean Latency + + EOF + + jq -r ' + "| QPS | Baseline | SDK (100%) | Overhead | SDK (10%) | Overhead |", + "|-----|----------|------------|----------|-----------|----------|", + (.baseline | to_entries[] | + . as $b | + ($b.key | tostring) as $qps | + "| \($qps) | \($b.value.mean_ms | . 
* 10 | round / 10)ms | - | - | - | - |" + ) + ' benchmarks/results/fixed-qps-latency.json >> "$SUMMARY_FILE" + + cat >> "$SUMMARY_FILE" << 'EOF' + + --- + + 📊 **Full results available in artifacts** + + EOF + + # Also write to GitHub step summary for UI display + cat "$SUMMARY_FILE" >> $GITHUB_STEP_SUMMARY + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.run_id }} + path: | + benchmarks/results/*.json + benchmarks/results/*.md + realistic_output.txt + fixed_qps_output.txt + retention-days: 90 + + - name: Download comparison results (if specified) + if: ${{ inputs.compare_with != '' }} + uses: actions/download-artifact@v4 + with: + name: benchmark-results-${{ inputs.compare_with }} + path: benchmarks/results/comparison/ + continue-on-error: true + + - name: Compare with previous run + if: ${{ inputs.compare_with != '' }} + run: | + if [ -f benchmarks/results/comparison/benchmark-summary.json ]; then + echo "## Comparison with Run ${{ inputs.compare_with }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Compare realistic workload results + PREV_READ=$(jq '.realistic_workload.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/comparison/benchmark-summary.json) + CURR_READ=$(jq '.realistic_workload.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/benchmark-summary.json) + + PREV_WRITE=$(jq '.realistic_workload.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/comparison/benchmark-summary.json) + CURR_WRITE=$(jq '.realistic_workload.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/benchmark-summary.json) + + echo "| Metric | Previous | Current | Delta |" >> $GITHUB_STEP_SUMMARY + echo "|--------|----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + echo "| Read API overhead | ${PREV_READ}ms | ${CURR_READ}ms | $(echo "$CURR_READ - $PREV_READ" | bc)ms |" >> $GITHUB_STEP_SUMMARY + echo "| Write API overhead | ${PREV_WRITE}ms | ${CURR_WRITE}ms | $(echo "$CURR_WRITE - $PREV_WRITE" | bc)ms |" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ Could not find comparison results for run ${{ inputs.compare_with }}" >> $GITHUB_STEP_SUMMARY + fi + + - name: Check for performance regression + id: regression + run: | + # Check if overhead exceeds threshold (3ms for 100% sampling) + THRESHOLD_MS=3.0 + + READ_OVERHEAD=$(jq '.comparison_100.typical_read.mean_overhead_ms' benchmarks/results/realistic-workload.json) + WRITE_OVERHEAD=$(jq '.comparison_100.typical_write.mean_overhead_ms' benchmarks/results/realistic-workload.json) + MIXED_OVERHEAD=$(jq '.comparison_100.realistic_mixed.mean_overhead_ms' benchmarks/results/realistic-workload.json) + + REGRESSION=false + + if (( $(echo "$READ_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then + echo "⚠️ Read API overhead ($READ_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY + REGRESSION=true + fi + + if (( $(echo "$WRITE_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then + echo "⚠️ Write API overhead ($WRITE_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY + REGRESSION=true + fi + + if (( $(echo "$MIXED_OVERHEAD > $THRESHOLD_MS" | bc -l) )); then + echo "⚠️ Mixed API overhead ($MIXED_OVERHEAD ms) exceeds threshold ($THRESHOLD_MS ms)" >> $GITHUB_STEP_SUMMARY + REGRESSION=true + fi + + if [ "$REGRESSION" = true ]; then + echo "" >> $GITHUB_STEP_SUMMARY + echo "### ⚠️ Performance regression detected" >> $GITHUB_STEP_SUMMARY + echo "regression=true" >> $GITHUB_OUTPUT + else + echo "" >> 
$GITHUB_STEP_SUMMARY + echo "### ✅ No performance regression detected" >> $GITHUB_STEP_SUMMARY + echo "regression=false" >> $GITHUB_OUTPUT + fi + + - name: Output JSON results + run: | + echo "### Structured Results (JSON)" + echo "" + echo '```json' + cat benchmarks/results/benchmark-summary.json + echo '```' diff --git a/benchmarks/README.md b/benchmarks/README.md index 594982c..1489e9b 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -56,6 +56,36 @@ After running benchmarks, compare the results: python benchmarks/compare_benchmarks.py ``` +### Comparing Different Runs + +To compare results from different benchmark runs: + +```bash +# Compare two result files +python benchmarks/scripts/compare_runs.py results/old-results.json results/new-results.json + +# Output as JSON only (for programmatic use) +python benchmarks/scripts/compare_runs.py --json results/old.json results/new.json +``` + +### GitHub Actions Workflow + +Benchmarks can be run via GitHub Actions for consistent, reproducible results: + +1. Go to **Actions** → **Benchmarks** +2. Click **Run workflow** +3. Optionally specify: + - `iterations`: Number of iterations for realistic workload (default: 200) + - `qps_duration`: Duration per QPS level in seconds (default: 10) + - `compare_with`: Run ID to compare against (optional) + +The workflow outputs: + +- Structured JSON with all results and metadata +- Markdown summary in the workflow run +- Artifacts containing full results (retained for 90 days) +- Regression detection if overhead exceeds 3ms threshold + ### Configuration You can configure benchmarks via environment variables: @@ -151,11 +181,14 @@ benchmarks/ │ ├── profile.sh # Profiler runner script │ ├── simple_profile.py # Profiling workload │ └── results/ # Profile output (gitignored) +├── scripts/ +│ └── compare_runs.py # Compare benchmark results across runs ├── server/ │ └── test_server.py # Flask test server ├── results/ # JSON output (gitignored) ├── compare_benchmarks.py # Result comparison script ├── run_benchmarks.sh # Runner script ├── PROFILING.md # Profiling documentation +├── RESULTS.md # Historical results └── README.md ``` diff --git a/benchmarks/bench/common.py b/benchmarks/bench/common.py index d68a156..76e331e 100644 --- a/benchmarks/bench/common.py +++ b/benchmarks/bench/common.py @@ -49,7 +49,12 @@ def run_benchmarks( """Run all benchmarks with the given label.""" opts = {**DEFAULT_OPTIONS, **(options or {})} - enable_memory_tracking = os.environ.get("BENCHMARK_ENABLE_MEMORY", "true").lower() != "false" + # Check environment variable first, then fall back to options + env_memory = os.environ.get("BENCHMARK_ENABLE_MEMORY") + if env_memory is not None: + enable_memory_tracking = env_memory.lower() != "false" + else: + enable_memory_tracking = bool(opts.get("enable_memory_tracking", True)) # Start test server server = TestServer() diff --git a/benchmarks/bench/resource_monitor.py b/benchmarks/bench/resource_monitor.py index 9174e16..8572d24 100644 --- a/benchmarks/bench/resource_monitor.py +++ b/benchmarks/bench/resource_monitor.py @@ -147,7 +147,13 @@ def _monitor_loop(self) -> None: def _collect_memory_sample(self) -> None: """Collect a memory usage sample.""" - if not self._is_running or not self._current_task_stats: + if not self._is_running: + return + + # Take a local snapshot to avoid race condition where another thread + # sets _current_task_stats to None between check and usage + task_stats = self._current_task_stats + if task_stats is None: return # Get RSS from resource module (in 
bytes on macOS, kilobytes on Linux) @@ -169,9 +175,9 @@ def _collect_memory_sample(self) -> None: except (FileNotFoundError, PermissionError): pass # Use ru_maxrss fallback - self._current_task_stats.rss_sum += rss - self._current_task_stats.rss_max = max(self._current_task_stats.rss_max, rss) - self._current_task_stats.sample_count += 1 + task_stats.rss_sum += rss + task_stats.rss_max = max(task_stats.rss_max, rss) + task_stats.sample_count += 1 def get_task_stats(self, task_name: str) -> TaskResourceStats | None: """Get resource statistics for a completed task.""" diff --git a/benchmarks/bench/result_utils.py b/benchmarks/bench/result_utils.py index 1e4a2b6..d581ab6 100644 --- a/benchmarks/bench/result_utils.py +++ b/benchmarks/bench/result_utils.py @@ -8,7 +8,7 @@ import statistics import uuid from dataclasses import dataclass, field -from datetime import UTC, datetime +from datetime import datetime, timezone from pathlib import Path from typing import Any @@ -189,7 +189,7 @@ def create_benchmark_result( return BenchmarkRunResult( id=str(uuid.uuid4()), label=label, - timestamp=datetime.now(UTC).isoformat(), + timestamp=datetime.now(timezone.utc).isoformat(), duration_ms=duration_ms, options=options, system=get_system_info(), diff --git a/benchmarks/profile/profile.sh b/benchmarks/profile/profile.sh index 6583ac9..84f3ca8 100755 --- a/benchmarks/profile/profile.sh +++ b/benchmarks/profile/profile.sh @@ -25,6 +25,10 @@ case "$PROFILER" in exit 1 fi + # Capture resolved binary path before sudo (root's PATH may not include user pip bin) + PYSPY_BIN="$(command -v py-spy)" + PYTHON_BIN="$(command -v python)" + OUTPUT="$RESULTS_DIR/flamegraph_$(date +%Y%m%d_%H%M%S).svg" echo "Output: $OUTPUT" echo "" @@ -32,9 +36,9 @@ case "$PROFILER" in # py-spy needs sudo on macOS for sampling if [[ "$OSTYPE" == "darwin"* ]]; then echo "Note: py-spy may require sudo on macOS" - sudo py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + sudo "$PYSPY_BIN" record -o "$OUTPUT" --rate 100 -- "$PYTHON_BIN" benchmarks/profile/simple_profile.py else - py-spy record -o "$OUTPUT" --rate 100 -- python benchmarks/profile/simple_profile.py + "$PYSPY_BIN" record -o "$OUTPUT" --rate 100 -- "$PYTHON_BIN" benchmarks/profile/simple_profile.py fi echo "" diff --git a/benchmarks/profile/simple_profile.py b/benchmarks/profile/simple_profile.py index 22bfb1e..93f3b76 100644 --- a/benchmarks/profile/simple_profile.py +++ b/benchmarks/profile/simple_profile.py @@ -66,48 +66,49 @@ def main(): # Use session for connection pooling session = requests.Session() - print(f"\nTest server started at {server_url}") - print("Running profiling workload...") - - # Number of iterations (adjust for longer/shorter profiling) - iterations = 500 - - start_time = time.time() - - for i in range(iterations): - # Mix of different request types - if i % 3 == 0: - # Simple GET - response = session.get(f"{server_url}/api/simple") - response.json() - elif i % 3 == 1: - # POST with JSON body - response = session.post( - f"{server_url}/api/echo", - json={"data": "test", "iteration": i}, - ) - response.json() - else: - # Sensitive data endpoint (triggers transform checks) - response = session.post( - f"{server_url}/api/auth/login", - json={"email": "test@example.com", "password": "secret123"}, - ) - response.json() - - if (i + 1) % 100 == 0: - elapsed = time.time() - start_time - rate = (i + 1) / elapsed - print(f" {i + 1}/{iterations} iterations ({rate:.1f} req/s)") - - elapsed = time.time() - start_time - print(f"\nCompleted 
{iterations} iterations in {elapsed:.2f}s") - print(f"Average: {iterations / elapsed:.1f} req/s") - - # Cleanup - session.close() - server.stop() - sdk.shutdown() + try: + print(f"\nTest server started at {server_url}") + print("Running profiling workload...") + + # Number of iterations (adjust for longer/shorter profiling) + iterations = 500 + + start_time = time.time() + + for i in range(iterations): + # Mix of different request types + if i % 3 == 0: + # Simple GET + response = session.get(f"{server_url}/api/simple") + response.json() + elif i % 3 == 1: + # POST with JSON body + response = session.post( + f"{server_url}/api/echo", + json={"data": "test", "iteration": i}, + ) + response.json() + else: + # Sensitive data endpoint (triggers transform checks) + response = session.post( + f"{server_url}/api/auth/login", + json={"email": "test@example.com", "password": "secret123"}, + ) + response.json() + + if (i + 1) % 100 == 0: + elapsed = time.time() - start_time + rate = (i + 1) / elapsed + print(f" {i + 1}/{iterations} iterations ({rate:.1f} req/s)") + + elapsed = time.time() - start_time + print(f"\nCompleted {iterations} iterations in {elapsed:.2f}s") + print(f"Average: {iterations / elapsed:.1f} req/s") + finally: + # Cleanup - always execute even if an exception is raised + session.close() + server.stop() + sdk.shutdown() if __name__ == "__main__": diff --git a/benchmarks/run_benchmarks.sh b/benchmarks/run_benchmarks.sh index 6367d74..1266f49 100755 --- a/benchmarks/run_benchmarks.sh +++ b/benchmarks/run_benchmarks.sh @@ -25,6 +25,12 @@ if ! python -c "import flask" 2>/dev/null; then uv pip install flask fi +# Check for requests +if ! python -c "import requests" 2>/dev/null; then + echo "Installing requests..." + uv pip install requests +fi + echo "" echo "Running SDK Disabled (baseline)..." echo "----------------------------------------------" diff --git a/benchmarks/scripts/compare_runs.py b/benchmarks/scripts/compare_runs.py new file mode 100755 index 0000000..18351ea --- /dev/null +++ b/benchmarks/scripts/compare_runs.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Compare benchmark results from different runs. 
+ +Usage: + python compare_runs.py results1.json results2.json + python compare_runs.py --download RUN_ID1 RUN_ID2 # Download from GitHub Actions +""" + +import argparse +import json + + +def load_results(path: str) -> dict: + """Load benchmark results from JSON file.""" + with open(path) as f: + return json.load(f) + + +def compare_realistic_workload(old: dict, new: dict) -> list[dict]: + """Compare realistic workload results.""" + comparisons = [] + + old_comp = old.get("comparison_100", old.get("realistic_workload", {}).get("comparison_100", {})) + new_comp = new.get("comparison_100", new.get("realistic_workload", {}).get("comparison_100", {})) + + for endpoint in ["typical_read", "typical_write", "realistic_mixed"]: + if endpoint in old_comp and endpoint in new_comp: + old_overhead = old_comp[endpoint]["mean_overhead_ms"] + new_overhead = new_comp[endpoint]["mean_overhead_ms"] + delta = new_overhead - old_overhead + delta_pct = (delta / old_overhead * 100) if old_overhead != 0 else 0 + + comparisons.append( + { + "endpoint": endpoint, + "old_overhead_ms": round(old_overhead, 2), + "new_overhead_ms": round(new_overhead, 2), + "delta_ms": round(delta, 2), + "delta_pct": round(delta_pct, 1), + "regression": delta > 0.5, # Flag if overhead increased by more than 0.5ms + } + ) + + return comparisons + + +def compare_fixed_qps(old: dict, new: dict) -> list[dict]: + """Compare fixed QPS results.""" + comparisons = [] + + old_baseline = old.get("baseline", old.get("fixed_qps_latency", {}).get("baseline", {})) + new_baseline = new.get("baseline", new.get("fixed_qps_latency", {}).get("baseline", {})) + + old_sdk = old.get("sdk_100", old.get("fixed_qps_latency", {}).get("sdk_100", {})) + new_sdk = new.get("sdk_100", new.get("fixed_qps_latency", {}).get("sdk_100", {})) + + for qps in ["25", "50", "75"]: + if qps in old_sdk and qps in new_sdk: + old_overhead = old_sdk[qps]["mean_ms"] - old_baseline.get(qps, {}).get("mean_ms", 0) + new_overhead = new_sdk[qps]["mean_ms"] - new_baseline.get(qps, {}).get("mean_ms", 0) + delta = new_overhead - old_overhead + + comparisons.append( + { + "qps": int(qps), + "old_overhead_ms": round(old_overhead, 2), + "new_overhead_ms": round(new_overhead, 2), + "delta_ms": round(delta, 2), + "regression": delta > 0.5, + } + ) + + return comparisons + + +def print_comparison(old_path: str, new_path: str, old: dict, new: dict) -> None: + """Print comparison results.""" + print("=" * 70) + print("Benchmark Comparison") + print("=" * 70) + print() + print(f"Old: {old_path}") + print(f"New: {new_path}") + print() + + # Metadata comparison if available + old_meta = old.get("metadata", {}) + new_meta = new.get("metadata", {}) + + if old_meta or new_meta: + print("### Metadata") + print(f" Old run: {old_meta.get('run_id', 'N/A')} @ {old_meta.get('timestamp', 'N/A')}") + print(f" New run: {new_meta.get('run_id', 'N/A')} @ {new_meta.get('timestamp', 'N/A')}") + print() + + # Realistic workload comparison + print("### Realistic Workload (100% sampling)") + print() + print("| Endpoint | Old Overhead | New Overhead | Delta | Status |") + print("|----------|--------------|--------------|-------|--------|") + + realistic_comps = compare_realistic_workload(old, new) + has_regression = False + + for comp in realistic_comps: + status = "⚠️ REGRESSION" if comp["regression"] else "✅ OK" + if comp["regression"]: + has_regression = True + print( + f"| {comp['endpoint']} | {comp['old_overhead_ms']}ms | {comp['new_overhead_ms']}ms | {comp['delta_ms']:+.2f}ms | {status} |" + ) + + print() + + # 
Fixed QPS comparison + print("### Fixed QPS Latency (100% sampling)") + print() + print("| QPS | Old Overhead | New Overhead | Delta | Status |") + print("|-----|--------------|--------------|-------|--------|") + + qps_comps = compare_fixed_qps(old, new) + + for comp in qps_comps: + status = "⚠️ REGRESSION" if comp["regression"] else "✅ OK" + if comp["regression"]: + has_regression = True + print( + f"| {comp['qps']} | {comp['old_overhead_ms']}ms | {comp['new_overhead_ms']}ms | {comp['delta_ms']:+.2f}ms | {status} |" + ) + + print() + + if has_regression: + print("⚠️ REGRESSION DETECTED: Some metrics show increased overhead") + else: + print("✅ No regressions detected") + + # Output as JSON for programmatic use + output = { + "realistic_workload": realistic_comps, + "fixed_qps": qps_comps, + "has_regression": has_regression, + } + + print() + print("### JSON Output") + print(json.dumps(output, indent=2)) + + +def main(): + parser = argparse.ArgumentParser(description="Compare benchmark results") + parser.add_argument("old", help="Path to old results JSON") + parser.add_argument("new", help="Path to new results JSON") + parser.add_argument("--json", action="store_true", help="Output only JSON") + + args = parser.parse_args() + + old = load_results(args.old) + new = load_results(args.new) + + if args.json: + realistic_comps = compare_realistic_workload(old, new) + qps_comps = compare_fixed_qps(old, new) + has_regression = any(c["regression"] for c in realistic_comps + qps_comps) + + output = { + "realistic_workload": realistic_comps, + "fixed_qps": qps_comps, + "has_regression": has_regression, + } + print(json.dumps(output, indent=2)) + else: + print_comparison(args.old, args.new, old, new) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/server/test_server.py b/benchmarks/server/test_server.py index 5cdddb3..e747a50 100644 --- a/benchmarks/server/test_server.py +++ b/benchmarks/server/test_server.py @@ -369,19 +369,13 @@ def typical_write() -> Response: def start(self) -> dict[str, Any]: """Start the test server in a background thread.""" - import socket - from werkzeug.serving import make_server - # Find a free port if port is 0 - if self.port == 0: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind((self.host, 0)) - self._actual_port = s.getsockname()[1] - else: - self._actual_port = self.port - - self._server = make_server(self.host, self._actual_port, self.app, threaded=True) + # Let make_server bind to port 0 to get an ephemeral port + # This avoids TOCTOU race where a separate socket finds a free port + # but another process binds to it before we can + self._server = make_server(self.host, self.port, self.app, threaded=True) + self._actual_port = self._server.server_port def run_server(): self._server.serve_forever() From 5d205509fba724e6c1107463327e9291fcf12623 Mon Sep 17 00:00:00 2001 From: JY Tan Date: Mon, 19 Jan 2026 20:05:09 -0800 Subject: [PATCH 3/4] Fix --- .github/workflows/benchmarks.yml | 4 ++++ benchmarks/bench/fixed_qps_latency.py | 4 ++-- benchmarks/bench/realistic_workload.py | 4 ++-- benchmarks/scripts/compare_runs.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7ff1749..87af87b 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -59,6 +59,8 @@ jobs: - name: Run realistic workload benchmark id: realistic + env: + BENCHMARK_ITERATIONS: ${{ inputs.iterations }} run: | uv run python 
benchmarks/bench/realistic_workload.py 2>&1 | tee realistic_output.txt # Extract just the results JSON @@ -66,6 +68,8 @@ jobs: - name: Run fixed QPS latency benchmark id: fixed_qps + env: + BENCHMARK_QPS_DURATION: ${{ inputs.qps_duration }} run: | uv run python benchmarks/bench/fixed_qps_latency.py 2>&1 | tee fixed_qps_output.txt # Extract just the results JSON diff --git a/benchmarks/bench/fixed_qps_latency.py b/benchmarks/bench/fixed_qps_latency.py index 2be9dda..58bc6f5 100644 --- a/benchmarks/bench/fixed_qps_latency.py +++ b/benchmarks/bench/fixed_qps_latency.py @@ -155,9 +155,9 @@ def main(): print("Endpoint: POST /api/realistic (~10-15ms baseline)") print() - # Test at multiple QPS levels + # Test at multiple QPS levels (duration configurable via BENCHMARK_QPS_DURATION env var) qps_levels = [25, 50, 75] # Requests per second - duration = 10 # seconds per test + duration = int(os.environ.get("BENCHMARK_QPS_DURATION", "10")) # seconds per test results = {"baseline": {}, "sdk_100": {}, "sdk_10": {}} diff --git a/benchmarks/bench/realistic_workload.py b/benchmarks/bench/realistic_workload.py index 77b3c0e..725ec94 100644 --- a/benchmarks/bench/realistic_workload.py +++ b/benchmarks/bench/realistic_workload.py @@ -70,8 +70,8 @@ def run_benchmark_subprocess(mode: str, sampling_rate: float = 1.0) -> dict | No session.post(f"{{server_url}}/api/typical-write", json={{"name": "test"}}) session.post(f"{{server_url}}/api/realistic", json={{"userId": "u1", "query": "test"}}) -# Benchmark parameters -iterations = 200 +# Benchmark parameters (configurable via BENCHMARK_ITERATIONS env var) +iterations = int(os.environ.get("BENCHMARK_ITERATIONS", "200")) results = {{}} # Test 1: Typical Read (~5-10ms baseline) diff --git a/benchmarks/scripts/compare_runs.py b/benchmarks/scripts/compare_runs.py index 18351ea..20ab534 100755 --- a/benchmarks/scripts/compare_runs.py +++ b/benchmarks/scripts/compare_runs.py @@ -3,7 +3,7 @@ Usage: python compare_runs.py results1.json results2.json - python compare_runs.py --download RUN_ID1 RUN_ID2 # Download from GitHub Actions + python compare_runs.py --json results1.json results2.json # JSON output only """ import argparse From 4f0dfac08f2a70260ae62876a32b4ab7f7c4bfc9 Mon Sep 17 00:00:00 2001 From: JY Tan Date: Mon, 19 Jan 2026 20:09:57 -0800 Subject: [PATCH 4/4] Fix --- benchmarks/bench/fixed_qps_latency.py | 5 ++++- benchmarks/bench/realistic_workload.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/bench/fixed_qps_latency.py b/benchmarks/bench/fixed_qps_latency.py index 58bc6f5..b74a2d5 100644 --- a/benchmarks/bench/fixed_qps_latency.py +++ b/benchmarks/bench/fixed_qps_latency.py @@ -157,7 +157,10 @@ def main(): # Test at multiple QPS levels (duration configurable via BENCHMARK_QPS_DURATION env var) qps_levels = [25, 50, 75] # Requests per second - duration = int(os.environ.get("BENCHMARK_QPS_DURATION", "10")) # seconds per test + try: + duration = int(os.environ.get("BENCHMARK_QPS_DURATION", "10")) + except ValueError: + duration = 10 # Fall back to default on invalid input results = {"baseline": {}, "sdk_100": {}, "sdk_10": {}} diff --git a/benchmarks/bench/realistic_workload.py b/benchmarks/bench/realistic_workload.py index 725ec94..7d7edf8 100644 --- a/benchmarks/bench/realistic_workload.py +++ b/benchmarks/bench/realistic_workload.py @@ -71,7 +71,10 @@ def run_benchmark_subprocess(mode: str, sampling_rate: float = 1.0) -> dict | No session.post(f"{{server_url}}/api/realistic", json={{"userId": "u1", "query": "test"}}) # 
Benchmark parameters (configurable via BENCHMARK_ITERATIONS env var) -iterations = int(os.environ.get("BENCHMARK_ITERATIONS", "200")) +try: + iterations = int(os.environ.get("BENCHMARK_ITERATIONS", "200")) +except ValueError: + iterations = 200 # Fall back to default on invalid input results = {{}} # Test 1: Typical Read (~5-10ms baseline)
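
Editor's note on the last patch: the try/except ValueError fallback it introduces is duplicated in `fixed_qps_latency.py` and `realistic_workload.py`. Below is a minimal sketch of a shared helper the two scripts could import instead; the module path `benchmarks/bench/env_utils.py` and the function name are hypothetical and not part of this patch series.

```python
# Hypothetical helper (e.g. benchmarks/bench/env_utils.py); a sketch, not part of the patch.
import os


def int_from_env(name: str, default: int) -> int:
    """Read an integer environment variable, falling back to `default`
    when the variable is unset or not a valid integer."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Invalid input falls back to the default, as in the patch;
        # the warning is an addition here for visibility.
        print(f"Warning: invalid {name}={raw!r}; falling back to {default}")
        return default


# Usage matching the two call sites touched above:
# duration = int_from_env("BENCHMARK_QPS_DURATION", 10)    # fixed_qps_latency.py
# iterations = int_from_env("BENCHMARK_ITERATIONS", 200)   # realistic_workload.py
```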