Skip to content

Commit 4e40640

Browse files
committed
Fingerprint benchmarks
1 parent ab6731c commit 4e40640

File tree

7 files changed

+1072
-9
lines changed

7 files changed

+1072
-9
lines changed

apps/webapp/app/env.server.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,6 +1195,7 @@ const EnvironmentSchema = z
11951195
RUN_REPLICATION_INSERT_MAX_DELAY_MS: z.coerce.number().int().default(2000),
11961196
RUN_REPLICATION_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"),
11971197
RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"),
1198+
RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"),
11981199

11991200
// Clickhouse
12001201
CLICKHOUSE_URL: z.string(),

apps/webapp/app/services/runsReplicationInstance.server.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ function initializeRunsReplicationInstance() {
6868
insertMaxDelayMs: env.RUN_REPLICATION_INSERT_MAX_DELAY_MS,
6969
insertStrategy: env.RUN_REPLICATION_INSERT_STRATEGY,
7070
disablePayloadInsert: env.RUN_REPLICATION_DISABLE_PAYLOAD_INSERT === "1",
71+
disableErrorFingerprinting: env.RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING === "1",
7172
});
7273

7374
if (env.RUN_REPLICATION_ENABLED === "1") {

apps/webapp/app/services/runsReplicationService.server.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ export type RunsReplicationServiceOptions = {
7171
insertBaseDelayMs?: number;
7272
insertMaxDelayMs?: number;
7373
disablePayloadInsert?: boolean;
74+
disableErrorFingerprinting?: boolean;
7475
};
7576

7677
type PostgresTaskRun = TaskRun & { masterQueue: string };
@@ -116,6 +117,7 @@ export class RunsReplicationService {
116117
private _insertMaxDelayMs: number;
117118
private _insertStrategy: "insert" | "insert_async";
118119
private _disablePayloadInsert: boolean;
120+
private _disableErrorFingerprinting: boolean;
119121

120122
// Metrics
121123
private _replicationLagHistogram: Histogram;
@@ -190,6 +192,7 @@ export class RunsReplicationService {
190192

191193
this._insertStrategy = options.insertStrategy ?? "insert";
192194
this._disablePayloadInsert = options.disablePayloadInsert ?? false;
195+
this._disableErrorFingerprinting = options.disableErrorFingerprinting ?? false;
193196

194197
this._replicationClient = new LogicalReplicationClient({
195198
pgConfig: {
@@ -856,7 +859,10 @@ export class RunsReplicationService {
856859
const errorData = { data: run.error };
857860

858861
// Calculate error fingerprint for failed runs
859-
const errorFingerprint = (['SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS'].includes(run.status))
862+
const errorFingerprint = (
863+
!this._disableErrorFingerprinting &&
864+
['SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS'].includes(run.status)
865+
)
860866
? calculateErrorFingerprint(run.error)
861867
: '';
862868

apps/webapp/app/utils/errorFingerprinting.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@ import { createHash } from "node:crypto";
55
* Groups similar errors together by normalizing dynamic values.
66
*/
77
export function calculateErrorFingerprint(error: unknown): string {
8-
if (!error || typeof error !== "object") return "";
8+
if (!error || typeof error !== "object" || Array.isArray(error)) return "";
99

10+
// This is a bit ugly but…
11+
// 1. We can't use a schema here because it's a hot path and needs to be fast.
12+
// 2. It won't be an instanceof Error because it's from the database.
1013
const errorObj = error as any;
11-
const errorType = errorObj.type || errorObj.name || "Error";
12-
const message = errorObj.message || "";
13-
const stack = errorObj.stack || errorObj.stacktrace || "";
14+
const errorType = String(errorObj.type || errorObj.name || "Error");
15+
const message = String(errorObj.message || "");
16+
const stack = String(errorObj.stack || errorObj.stacktrace || "");
1417

1518
// Normalize message to group similar errors
1619
const normalizedMessage = normalizeErrorMessage(message);
@@ -35,10 +38,7 @@ export function normalizeErrorMessage(message: string): string {
3538
return (
3639
message
3740
// UUIDs (8-4-4-4-12 format)
38-
.replace(
39-
/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi,
40-
"<uuid>"
41-
)
41+
.replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, "<uuid>")
4242
// Run IDs (run_xxxxx format)
4343
.replace(/run_[a-zA-Z0-9]+/g, "<run-id>")
4444
// Task run friendly IDs (task_xxxxx or similar)
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
# RunsReplicationService Error Fingerprinting Benchmark
2+
3+
This benchmark measures the performance impact of error fingerprinting in the RunsReplicationService.
4+
5+
## Overview
6+
7+
The benchmark:
8+
1. Creates a realistic dataset of TaskRuns (7% with errors by default)
9+
2. Runs the producer in a **separate process** to simulate real-world load
10+
3. Measures replication throughput and Event Loop Utilization (ELU)
11+
4. Compares performance with fingerprinting **enabled** vs **disabled**
12+
13+
## Architecture
14+
15+
```
16+
┌─────────────────┐ ┌──────────────────────┐
17+
│ Producer │ │ Benchmark Test │
18+
│ (Child Process)│─────────│ (Main Process) │
19+
│ │ IPC │ │
20+
│ - Inserts │ │ - RunsReplication │
21+
│ TaskRuns │ │ Service │
22+
│ to Postgres │ │ - ELU Monitor │
23+
│ │ │ - Metrics │
24+
└─────────────────┘ └──────────────────────┘
25+
│ │
26+
│ │
27+
▼ ▼
28+
┌──────────┐ ┌──────────────┐
29+
│ Postgres │ │ ClickHouse │
30+
└──────────┘ └──────────────┘
31+
```
32+
33+
## Files
34+
35+
- `runsReplicationBenchmark.test.ts` - Main benchmark test
36+
- `runsReplicationBenchmark.producer.ts` - Producer script (runs in child process)
37+
- `runsReplicationBenchmark.README.md` - This file
38+
39+
## Configuration
40+
41+
The benchmark can be configured via environment variables or by editing `BENCHMARK_CONFIG` in the test file:
42+
43+
```typescript
44+
const BENCHMARK_CONFIG = {
45+
// Number of runs to create
46+
NUM_RUNS: parseInt(process.env.BENCHMARK_NUM_RUNS || "5000", 10),
47+
48+
// Error rate (0.07 = 7%)
49+
ERROR_RATE: 0.07,
50+
51+
// Producer batch size
52+
PRODUCER_BATCH_SIZE: 100,
53+
54+
// Replication service settings
55+
FLUSH_BATCH_SIZE: 50,
56+
FLUSH_INTERVAL_MS: 100,
57+
MAX_FLUSH_CONCURRENCY: 4,
58+
59+
// Timeout
60+
REPLICATION_TIMEOUT_MS: 120_000, // 2 minutes
61+
};
62+
```
63+
64+
## Running the Benchmark
65+
66+
### Quick Test (Small Dataset)
67+
68+
```bash
69+
cd apps/webapp
70+
BENCHMARK_NUM_RUNS=1000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run
71+
```
72+
73+
### Realistic Benchmark (Larger Dataset)
74+
75+
```bash
76+
cd apps/webapp
77+
BENCHMARK_NUM_RUNS=10000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run
78+
```
79+
80+
### High Volume Benchmark
81+
82+
```bash
83+
cd apps/webapp
84+
BENCHMARK_NUM_RUNS=50000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run
85+
```
86+
87+
**Note:** The test is marked with `.skip` by default. To run it, remove the `.skip` from the test:
88+
89+
```typescript
90+
// Change this:
91+
containerTest.skip("should benchmark...", async () => {
92+
93+
// To this:
94+
containerTest("should benchmark...", async () => {
95+
```
96+
97+
## What Gets Measured
98+
99+
### 1. Producer Metrics
100+
- Total runs created
101+
- Runs with errors (should be ~7%)
102+
- Duration
103+
- Throughput (runs/sec)
104+
105+
### 2. Replication Metrics
106+
- Total runs replicated to ClickHouse
107+
- Replication duration
108+
- Replication throughput (runs/sec)
109+
110+
### 3. Event Loop Utilization (ELU)
111+
- Mean utilization (%)
112+
- P50 (median) utilization (%)
113+
- P95 utilization (%)
114+
- P99 utilization (%)
115+
- All samples for detailed analysis
116+
117+
### 4. OpenTelemetry Metrics
118+
- Batches flushed
119+
- Task runs inserted
120+
- Payloads inserted
121+
- Events processed
122+
123+
## Output
124+
125+
The benchmark produces detailed output including:
126+
127+
```
128+
================================================================================
129+
BENCHMARK: baseline-no-fingerprinting
130+
Error Fingerprinting: DISABLED
131+
Runs: 5000, Error Rate: 7.0%
132+
================================================================================
133+
134+
[Producer] Starting - will create 5000 runs (7.0% with errors)
135+
[Producer] Progress: 1000/5000 runs (2500 runs/sec)
136+
...
137+
[Producer] Completed:
138+
- Total runs: 5000
139+
- With errors: 352 (7.0%)
140+
- Duration: 2145ms
141+
- Throughput: 2331 runs/sec
142+
143+
[Benchmark] Waiting for replication to complete...
144+
145+
================================================================================
146+
RESULTS: baseline-no-fingerprinting
147+
================================================================================
148+
149+
Producer:
150+
Created: 5000 runs
151+
With errors: 352 (7.0%)
152+
Duration: 2145ms
153+
Throughput: 2331 runs/sec
154+
155+
Replication:
156+
Replicated: 5000 runs
157+
Duration: 3456ms
158+
Throughput: 1447 runs/sec
159+
160+
Event Loop Utilization:
161+
Mean: 23.45%
162+
P50: 22.10%
163+
P95: 34.20%
164+
P99: 41.30%
165+
Samples: 346
166+
167+
Metrics:
168+
Batches flushed: 102
169+
Task runs inserted: 5000
170+
Payloads inserted: 5000
171+
Events processed: 5000
172+
================================================================================
173+
174+
[... Similar output for "with-fingerprinting" benchmark ...]
175+
176+
================================================================================
177+
COMPARISON
178+
Baseline: baseline-no-fingerprinting (fingerprinting OFF)
179+
Comparison: with-fingerprinting (fingerprinting ON)
180+
================================================================================
181+
182+
Replication Duration:
183+
3456ms → 3512ms (+1.62%)
184+
185+
Throughput:
186+
1447 → 1424 runs/sec (-1.59%)
187+
188+
Event Loop Utilization (Mean):
189+
23.45% → 24.12% (+2.86%)
190+
191+
Event Loop Utilization (P99):
192+
41.30% → 43.20% (+4.60%)
193+
194+
================================================================================
195+
196+
BENCHMARK COMPLETE
197+
Fingerprinting impact on replication duration: +1.62%
198+
Fingerprinting impact on throughput: -1.59%
199+
Fingerprinting impact on ELU (mean): +2.86%
200+
Fingerprinting impact on ELU (P99): +4.60%
201+
```
202+
203+
## Interpreting Results
204+
205+
### What to Look For
206+
207+
1. **Replication Duration Delta** - How much longer replication takes with fingerprinting
208+
2. **Throughput Delta** - Change in runs processed per second
209+
3. **ELU Delta** - Change in event loop utilization (higher = more CPU bound)
210+
211+
### Expected Results
212+
213+
With a 7% error rate and SHA-256 hashing:
214+
- **Small impact** (<5% overhead): Fingerprinting is well optimized
215+
- **Moderate impact** (5-15% overhead): May want to consider optimizations
216+
- **Large impact** (>15% overhead): Fingerprinting needs optimization
217+
218+
### Performance Optimization Ideas
219+
220+
If the benchmark shows significant overhead, consider:
221+
222+
1. **Faster hashing algorithm** - Replace SHA-256 with xxHash or MurmurHash3
223+
2. **Worker threads** - Move fingerprinting to worker threads
224+
3. **Caching** - Cache fingerprints for identical errors
225+
4. **Lazy computation** - Only compute fingerprints when needed
226+
5. **Batch processing** - Group similar errors before hashing
227+
228+
## Dataset Characteristics
229+
230+
The producer generates realistic error variety:
231+
232+
- TypeError (undefined property access)
233+
- Error (API fetch failures)
234+
- ValidationError (input validation)
235+
- TimeoutError (operation timeouts)
236+
- DatabaseError (connection failures)
237+
- ReferenceError (undefined variables)
238+
239+
Each error template includes:
240+
- Realistic stack traces
241+
- Variable IDs and timestamps
242+
- Line/column numbers
243+
- File paths
244+
245+
This ensures the fingerprinting algorithm is tested with realistic data.
246+
247+
## Troubleshooting
248+
249+
### Benchmark Times Out
250+
251+
Increase the timeout:
252+
```typescript
253+
REPLICATION_TIMEOUT_MS: 300_000, // 5 minutes
254+
```
255+
256+
### Producer Fails
257+
258+
Check Postgres connection and ensure:
259+
- Docker services are running (`pnpm run docker`)
260+
- Database is accessible
261+
- Sufficient disk space
262+
263+
### Different Results Each Run
264+
265+
This is normal! Factors affecting variance:
266+
- System load
267+
- Docker container overhead
268+
- Database I/O
269+
- Network latency (even localhost)
270+
271+
Run multiple times and look at trends.
272+
273+
## Future Enhancements
274+
275+
Potential improvements to the benchmark:
276+
277+
1. **Multiple error rates** - Test 0%, 5%, 10%, 25%, 50% error rates
278+
2. **Different hash algorithms** - Compare SHA-256 vs xxHash vs MurmurHash3
279+
3. **Worker thread comparison** - Test main thread vs worker threads
280+
4. **Concurrent producers** - Multiple producer processes
281+
5. **Memory profiling** - Track memory usage over time
282+
6. **Flame graphs** - Generate CPU flame graphs for analysis
283+
7. **Historical tracking** - Store results over time to track regressions

0 commit comments

Comments
 (0)