# RunsReplicationService Error Fingerprinting Benchmark

This benchmark measures the performance impact of error fingerprinting in the RunsReplicationService.

## Overview

The benchmark:
1. Creates a realistic dataset of TaskRuns (7% with errors by default)
2. Runs the producer in a **separate process** to simulate real-world load
3. Measures replication throughput and Event Loop Utilization (ELU)
4. Compares performance with fingerprinting **enabled** vs **disabled**

## Architecture

```
┌─────────────────┐          ┌──────────────────────┐
│    Producer     │          │    Benchmark Test    │
│ (Child Process) │──────────│    (Main Process)    │
│                 │   IPC    │                      │
│ - Inserts       │          │ - RunsReplication    │
│   TaskRuns      │          │   Service            │
│   to Postgres   │          │ - ELU Monitor        │
│                 │          │ - Metrics            │
└─────────────────┘          └──────────────────────┘
         │                              │
         │                              │
         ▼                              ▼
    ┌──────────┐               ┌──────────────┐
    │ Postgres │               │  ClickHouse  │
    └──────────┘               └──────────────┘
```

## Files

- `runsReplicationBenchmark.test.ts` - Main benchmark test
- `runsReplicationBenchmark.producer.ts` - Producer script (runs in a child process)
- `runsReplicationBenchmark.README.md` - This file

## Configuration

The benchmark can be configured via environment variables or by editing `BENCHMARK_CONFIG` in the test file:

```typescript
const BENCHMARK_CONFIG = {
  // Number of runs to create
  NUM_RUNS: parseInt(process.env.BENCHMARK_NUM_RUNS || "5000", 10),

  // Error rate (0.07 = 7%)
  ERROR_RATE: 0.07,

  // Producer batch size
  PRODUCER_BATCH_SIZE: 100,

  // Replication service settings
  FLUSH_BATCH_SIZE: 50,
  FLUSH_INTERVAL_MS: 100,
  MAX_FLUSH_CONCURRENCY: 4,

  // Timeout
  REPLICATION_TIMEOUT_MS: 120_000, // 2 minutes
};
```

## Running the Benchmark

### Quick Test (Small Dataset)

```bash
cd apps/webapp
BENCHMARK_NUM_RUNS=1000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run
```

### Realistic Benchmark (Larger Dataset)

```bash
cd apps/webapp
BENCHMARK_NUM_RUNS=10000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run
```

### High Volume Benchmark

```bash
cd apps/webapp
BENCHMARK_NUM_RUNS=50000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run
```

**Note:** The test is marked with `.skip` by default. To run it, remove the `.skip` from the test:

```typescript
// Change this:
containerTest.skip("should benchmark...", async () => {

// To this:
containerTest("should benchmark...", async () => {
```

## What Gets Measured

### 1. Producer Metrics
- Total runs created
- Runs with errors (should be ~7%)
- Duration
- Throughput (runs/sec)

### 2. Replication Metrics
- Total runs replicated to ClickHouse
- Replication duration
- Replication throughput (runs/sec)

### 3. Event Loop Utilization (ELU)
- Mean utilization (%)
- P50 (median) utilization (%)
- P95 utilization (%)
- P99 utilization (%)
- All samples for detailed analysis

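These ELU statistics can be collected with Node's built-in `perf_hooks` API. The sketch below is illustrative, not the monitor from the test file; the sampling interval and helper names are assumptions:

```typescript
import { performance } from "node:perf_hooks";

// Sample interval-scoped ELU: diffing two readings isolates utilization
// within the window rather than measuring since process start.
export function startEluSampling(samples: number[], intervalMs = 100): NodeJS.Timeout {
  let last = performance.eventLoopUtilization();
  return setInterval(() => {
    const current = performance.eventLoopUtilization();
    samples.push(performance.eventLoopUtilization(current, last).utilization);
    last = current;
  }, intervalMs);
}

// Nearest-rank percentile over the collected samples (for P50/P95/P99).
export function percentile(samples: number[], p: number): number {
  const sorted = [...samples].sort((a, b) => a - b);
  const rank = Math.max(0, Math.ceil((p / 100) * sorted.length) - 1);
  return sorted[rank];
}
```

Remember to `clearInterval` the returned timer before computing the final statistics, otherwise the sampler keeps the process alive.
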
### 4. OpenTelemetry Metrics
- Batches flushed
- Task runs inserted
- Payloads inserted
- Events processed

## Output

The benchmark produces detailed output including:

```
================================================================================
BENCHMARK: baseline-no-fingerprinting
Error Fingerprinting: DISABLED
Runs: 5000, Error Rate: 7.0%
================================================================================

[Producer] Starting - will create 5000 runs (7.0% with errors)
[Producer] Progress: 1000/5000 runs (2500 runs/sec)
...
[Producer] Completed:
  - Total runs: 5000
  - With errors: 352 (7.0%)
  - Duration: 2145ms
  - Throughput: 2331 runs/sec

[Benchmark] Waiting for replication to complete...

================================================================================
RESULTS: baseline-no-fingerprinting
================================================================================

Producer:
  Created: 5000 runs
  With errors: 352 (7.0%)
  Duration: 2145ms
  Throughput: 2331 runs/sec

Replication:
  Replicated: 5000 runs
  Duration: 3456ms
  Throughput: 1447 runs/sec

Event Loop Utilization:
  Mean: 23.45%
  P50: 22.10%
  P95: 34.20%
  P99: 41.30%
  Samples: 346

Metrics:
  Batches flushed: 102
  Task runs inserted: 5000
  Payloads inserted: 5000
  Events processed: 5000
================================================================================

[... Similar output for "with-fingerprinting" benchmark ...]

================================================================================
COMPARISON
Baseline: baseline-no-fingerprinting (fingerprinting OFF)
Comparison: with-fingerprinting (fingerprinting ON)
================================================================================

Replication Duration:
  3456ms → 3512ms (+1.62%)

Throughput:
  1447 → 1424 runs/sec (-1.59%)

Event Loop Utilization (Mean):
  23.45% → 24.12% (+2.86%)

Event Loop Utilization (P99):
  41.30% → 43.20% (+4.60%)

================================================================================

BENCHMARK COMPLETE
Fingerprinting impact on replication duration: +1.62%
Fingerprinting impact on throughput: -1.59%
Fingerprinting impact on ELU (mean): +2.86%
Fingerprinting impact on ELU (P99): +4.60%
```

## Interpreting Results

### What to Look For

1. **Replication Duration Delta** - How much longer replication takes with fingerprinting
2. **Throughput Delta** - Change in runs processed per second
3. **ELU Delta** - Change in event loop utilization (higher = more CPU bound)

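The percentage deltas in the comparison are simple relative changes between the two runs; a minimal sketch (the function name is illustrative, not from the test file):

```typescript
// Signed percentage change from baseline to comparison, formatted in
// the "+1.62%" / "-1.59%" style used by the sample output above.
export function deltaPercent(baseline: number, comparison: number): string {
  const pct = ((comparison - baseline) / baseline) * 100;
  return `${pct >= 0 ? "+" : ""}${pct.toFixed(2)}%`;
}
```

For example, `deltaPercent(3456, 3512)` reproduces the `+1.62%` replication-duration delta shown in the sample output.
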
### Expected Results

With a 7% error rate and SHA-256 hashing:
- **Small impact** (<5% overhead): fingerprinting is well optimized
- **Moderate impact** (5-15% overhead): worth considering optimizations
- **Large impact** (>15% overhead): fingerprinting needs optimization

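For intuition about the per-error cost being measured, a SHA-256 fingerprint over a normalized error might look like the sketch below. The normalization rules and function name are assumptions for illustration, not the service's actual logic:

```typescript
import { createHash } from "node:crypto";

// Hypothetical fingerprint: strip volatile parts (long hex ids, numbers)
// from the message so recurring errors hash to the same value.
export function fingerprintError(type: string, message: string): string {
  const normalized = message
    .replace(/[0-9a-f]{8,}/gi, "<id>")
    .replace(/\d+/g, "<n>");
  return createHash("sha256").update(`${type}:${normalized}`).digest("hex");
}
```

With this normalization, `"timed out after 5000ms"` and `"timed out after 3000ms"` produce the same fingerprint, which is the property that makes error grouping useful.
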
### Performance Optimization Ideas

If the benchmark shows significant overhead, consider:

1. **Faster hashing algorithm** - Replace SHA-256 with xxHash or MurmurHash3
2. **Worker threads** - Move fingerprinting to worker threads
3. **Caching** - Cache fingerprints for identical errors
4. **Lazy computation** - Only compute fingerprints when needed
5. **Batch processing** - Group similar errors before hashing

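Of these, caching is the easiest to sketch. A hypothetical memoization layer (not the service's code) could look like:

```typescript
import { createHash } from "node:crypto";

// Memoize fingerprints by the raw error string so repeated identical
// errors skip hashing entirely. An unbounded Map is fine for a
// benchmark; production code would want an LRU bound.
const fingerprintCache = new Map<string, string>();

export function cachedFingerprint(raw: string): string {
  let fp = fingerprintCache.get(raw);
  if (fp === undefined) {
    fp = createHash("sha256").update(raw).digest("hex");
    fingerprintCache.set(raw, fp);
  }
  return fp;
}
```

Note that a cache only pays off if identical error strings actually recur; the benchmark's varied templates deliberately limit how often that happens.
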
## Dataset Characteristics

The producer generates a realistic variety of errors:

- TypeError (undefined property access)
- Error (API fetch failures)
- ValidationError (input validation)
- TimeoutError (operation timeouts)
- DatabaseError (connection failures)
- ReferenceError (undefined variables)

Each error template includes:
- Realistic stack traces
- Variable IDs and timestamps
- Line/column numbers
- File paths

This ensures the fingerprinting algorithm is tested with realistic data.

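The actual templates live in the producer script; as a rough illustration of their shape (these specific templates and the `makeError` helper are hypothetical), error generation could look like:

```typescript
// Hypothetical templates mirroring the error variety listed above; the
// real producer templates also include full stack traces and file paths.
const ERROR_TEMPLATES = [
  (i: number) => ({
    name: "TypeError",
    message: `Cannot read properties of undefined (reading 'prop_${i}')`,
  }),
  (i: number) => ({
    name: "TimeoutError",
    message: `Operation timed out after ${1000 + (i % 5000)}ms`,
  }),
  (i: number) => ({
    name: "DatabaseError",
    message: `Connection to db-shard-${i % 4} refused`,
  }),
];

export function makeError(runIndex: number): { name: string; message: string } {
  return ERROR_TEMPLATES[runIndex % ERROR_TEMPLATES.length](runIndex);
}
```

Interpolating the run index into each message keeps the dataset varied, which stresses the fingerprinting normalization rather than a single hot cache entry.
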
## Troubleshooting

### Benchmark Times Out

Increase the timeout:
```typescript
REPLICATION_TIMEOUT_MS: 300_000, // 5 minutes
```

### Producer Fails

Check the Postgres connection and ensure:
- Docker services are running (`pnpm run docker`)
- The database is accessible
- There is sufficient disk space

### Different Results Each Run

This is normal! Factors affecting variance:
- System load
- Docker container overhead
- Database I/O
- Network latency (even on localhost)

Run the benchmark multiple times and look at trends.

## Future Enhancements

Potential improvements to the benchmark:

1. **Multiple error rates** - Test 0%, 5%, 10%, 25%, 50% error rates
2. **Different hash algorithms** - Compare SHA-256 vs xxHash vs MurmurHash3
3. **Worker thread comparison** - Test main thread vs worker threads
4. **Concurrent producers** - Multiple producer processes
5. **Memory profiling** - Track memory usage over time
6. **Flame graphs** - Generate CPU flame graphs for analysis
7. **Historical tracking** - Store results over time to track regressions