Skip to content

Commit e9fc7af

Browse files
committed
Merge remote-tracking branch 'origin/main' into feat/arm64-registry
2 parents eb99545 + 08d84eb commit e9fc7af

26 files changed

+1584
-69
lines changed

apps/webapp/app/env.server.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,10 @@ const EnvironmentSchema = z.object({
430430
RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS: z.coerce.number().int().default(200),
431431
RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS: z.coerce.number().int().default(10),
432432
RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS: z.coerce.number().int().default(500),
433+
RUN_ENGINE_CONCURRENCY_SWEEPER_SCAN_SCHEDULE: z.string().optional(),
434+
RUN_ENGINE_CONCURRENCY_SWEEPER_PROCESS_MARKED_SCHEDULE: z.string().optional(),
435+
RUN_ENGINE_CONCURRENCY_SWEEPER_SCAN_JITTER_IN_MS: z.coerce.number().int().optional(),
436+
RUN_ENGINE_CONCURRENCY_SWEEPER_PROCESS_MARKED_JITTER_IN_MS: z.coerce.number().int().optional(),
433437

434438
RUN_ENGINE_RUN_LOCK_DURATION: z.coerce.number().int().default(5000),
435439
RUN_ENGINE_RUN_LOCK_AUTOMATIC_EXTENSION_THRESHOLD: z.coerce.number().int().default(1000),
@@ -595,6 +599,7 @@ const EnvironmentSchema = z.object({
595599

596600
RUN_ENGINE_WORKER_ENABLED: z.string().default("1"),
597601
RUN_ENGINE_WORKER_LOG_LEVEL: z.enum(["log", "error", "warn", "info", "debug"]).default("info"),
602+
RUN_ENGINE_RUN_QUEUE_LOG_LEVEL: z.enum(["log", "error", "warn", "info", "debug"]).default("info"),
598603

599604
/** How long should the presence ttl last */
600605
DEV_PRESENCE_SSE_TIMEOUT: z.coerce.number().int().default(30_000),
@@ -839,9 +844,13 @@ const EnvironmentSchema = z.object({
839844
RUN_REPLICATION_LEADER_LOCK_ADDITIONAL_TIME_MS: z.coerce.number().int().default(10_000),
840845
RUN_REPLICATION_LEADER_LOCK_RETRY_INTERVAL_MS: z.coerce.number().int().default(500),
841846
RUN_REPLICATION_WAIT_FOR_ASYNC_INSERT: z.string().default("0"),
842-
RUN_REPLICATION_KEEP_ALIVE_ENABLED: z.string().default("1"),
847+
RUN_REPLICATION_KEEP_ALIVE_ENABLED: z.string().default("0"),
843848
RUN_REPLICATION_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(),
844849
RUN_REPLICATION_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10),
850+
// Retry configuration for insert operations
851+
RUN_REPLICATION_INSERT_MAX_RETRIES: z.coerce.number().int().default(3),
852+
RUN_REPLICATION_INSERT_BASE_DELAY_MS: z.coerce.number().int().default(100),
853+
RUN_REPLICATION_INSERT_MAX_DELAY_MS: z.coerce.number().int().default(2000),
845854

846855
// Clickhouse
847856
CLICKHOUSE_URL: z.string().optional(),

apps/webapp/app/services/runsReplicationInstance.server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ function initializeRunsReplicationInstance() {
6262
logLevel: env.RUN_REPLICATION_LOG_LEVEL,
6363
waitForAsyncInsert: env.RUN_REPLICATION_WAIT_FOR_ASYNC_INSERT === "1",
6464
tracer: provider.getTracer("runs-replication-service"),
65+
insertMaxRetries: env.RUN_REPLICATION_INSERT_MAX_RETRIES,
66+
insertBaseDelayMs: env.RUN_REPLICATION_INSERT_BASE_DELAY_MS,
67+
insertMaxDelayMs: env.RUN_REPLICATION_INSERT_MAX_DELAY_MS,
6568
});
6669

6770
if (env.RUN_REPLICATION_ENABLED === "1") {

apps/webapp/app/services/runsReplicationService.server.ts

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import { TaskRun } from "@trigger.dev/database";
1515
import { nanoid } from "nanoid";
1616
import EventEmitter from "node:events";
1717
import pLimit from "p-limit";
18+
import { logger } from "./logger.server";
19+
import { detectBadJsonStrings } from "~/utils/detectBadJsonStrings";
1820

1921
interface TransactionEvent<T = any> {
2022
tag: "insert" | "update" | "delete";
@@ -51,6 +53,10 @@ export type RunsReplicationServiceOptions = {
5153
logLevel?: LogLevel;
5254
tracer?: Tracer;
5355
waitForAsyncInsert?: boolean;
56+
// Retry configuration for insert operations
57+
insertMaxRetries?: number;
58+
insertBaseDelayMs?: number;
59+
insertMaxDelayMs?: number;
5460
};
5561

5662
type TaskRunInsert = { _version: bigint; run: TaskRun; event: "insert" | "update" | "delete" };
@@ -80,6 +86,10 @@ export class RunsReplicationService {
8086
private _latestCommitEndLsn: string | null = null;
8187
private _lastAcknowledgedLsn: string | null = null;
8288
private _acknowledgeInterval: NodeJS.Timeout | null = null;
89+
// Retry configuration
90+
private _insertMaxRetries: number;
91+
private _insertBaseDelayMs: number;
92+
private _insertMaxDelayMs: number;
8393

8494
public readonly events: EventEmitter<RunsReplicationServiceEvents>;
8595

@@ -151,6 +161,11 @@ export class RunsReplicationService {
151161
this._replicationClient.events.on("leaderElection", (isLeader) => {
152162
this.logger.info("Leader election", { isLeader });
153163
});
164+
165+
// Initialize retry configuration
166+
this._insertMaxRetries = options.insertMaxRetries ?? 3;
167+
this._insertBaseDelayMs = options.insertBaseDelayMs ?? 100;
168+
this._insertMaxDelayMs = options.insertMaxDelayMs ?? 2000;
154169
}
155170

156171
public async shutdown() {
@@ -445,8 +460,37 @@ export class RunsReplicationService {
445460
payloadInserts: payloadInserts.length,
446461
});
447462

448-
await this.#insertTaskRunInserts(taskRunInserts);
449-
await this.#insertPayloadInserts(payloadInserts);
463+
// Insert task runs and payloads with retry logic for connection errors
464+
const [taskRunError, taskRunResult] = await this.#insertWithRetry(
465+
() => this.#insertTaskRunInserts(taskRunInserts),
466+
"task run inserts",
467+
flushId
468+
);
469+
470+
const [payloadError, payloadResult] = await this.#insertWithRetry(
471+
() => this.#insertPayloadInserts(payloadInserts),
472+
"payload inserts",
473+
flushId
474+
);
475+
476+
// Log any errors that occurred
477+
if (taskRunError) {
478+
this.logger.error("Error inserting task run inserts", {
479+
error: taskRunError,
480+
flushId,
481+
runIds: taskRunInserts.map((r) => r.run_id),
482+
});
483+
recordSpanError(span, taskRunError);
484+
}
485+
486+
if (payloadError) {
487+
this.logger.error("Error inserting payload inserts", {
488+
error: payloadError,
489+
flushId,
490+
runIds: payloadInserts.map((r) => r.run_id),
491+
});
492+
recordSpanError(span, payloadError);
493+
}
450494

451495
this.logger.debug("Flushed inserts", {
452496
flushId,
@@ -456,6 +500,73 @@ export class RunsReplicationService {
456500
});
457501
}
458502

503+
// New method to handle inserts with retry logic for connection errors
504+
/**
 * Runs `insertFn`, retrying after a backoff delay when it throws an error that
 * `#isRetryableConnectionError` classifies as retryable.
 *
 * This method does not rethrow errors from `insertFn`; the outcome is returned
 * as an `[error, result]` tuple so the caller decides how to report a failure.
 *
 * `_insertMaxRetries` is the total number of attempts (the first call included).
 * At least one attempt is always made, even when the configured value is 0,
 * negative or NaN.
 *
 * @param insertFn - The insert operation; invoked once per attempt.
 * @param operationName - Label interpolated into the retry warning message.
 * @param flushId - Flush identifier attached to the retry warning.
 * @returns `[null, result]` on success, or `[error, null]` when a non-retryable
 *   error occurs or all attempts are used up.
 */
async #insertWithRetry<T>(
  insertFn: () => Promise<T>,
  operationName: string,
  flushId: string
): Promise<[Error | null, T | null]> {
  // Guard against a misconfigured attempt count: without this, a value of 0 would
  // skip the loop and return [null, null], i.e. report success without inserting.
  const maxAttempts = Math.max(1, Math.floor(this._insertMaxRetries) || 1);
  let lastError: Error | null = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return [null, await insertFn()];
    } catch (error) {
      // Normalise non-Error throwables so callers always receive an Error.
      lastError = error instanceof Error ? error : new Error(String(error));

      const canRetry = attempt < maxAttempts && this.#isRetryableConnectionError(lastError);
      if (!canRetry) {
        break;
      }

      const delay = this.#calculateConnectionRetryDelay(attempt);

      this.logger.warn(`Retrying ${operationName} due to connection error`, {
        flushId,
        attempt,
        maxRetries: maxAttempts,
        error: lastError.message,
        delay,
      });

      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }

  return [lastError, null];
}
539+
540+
// New method to check if an error is a retryable connection error
541+
/**
 * Decides whether an error looks like a transient connection failure that is
 * worth retrying.
 *
 * The check is a case-insensitive substring match against the error message and,
 * when present as a string, the error's `code` property. Node.js reports socket
 * failures with errno-style codes (for example `connect ECONNREFUSED host:port`),
 * so those tokens are listed alongside the human-readable phrases.
 *
 * @param error - The error thrown by the insert operation.
 * @returns `true` when the message or code contains a known connection-failure marker.
 */
#isRetryableConnectionError(error: Error): boolean {
  const errorMessage = error.message.toLowerCase();

  // `code` is not part of the base Error type; read it defensively.
  const rawCode = (error as Error & { code?: unknown }).code;
  const errorCode = typeof rawCode === "string" ? rawCode.toLowerCase() : "";

  // "read econnreset" / "write econnreset" are covered by "econnreset".
  const retryableConnectionPatterns = [
    "socket hang up",
    "econnreset",
    "econnrefused",
    "etimedout",
    "epipe",
    "connection reset",
    "connection refused",
    "connection timeout",
    "network error",
  ];

  return retryableConnectionPatterns.some(
    (pattern) => errorMessage.includes(pattern) || errorCode.includes(pattern)
  );
}
556+
557+
// New method to calculate retry delay for connection errors
558+
/**
 * Computes how long to wait before the next retry.
 *
 * The delay doubles with each attempt starting from `_insertBaseDelayMs`, is capped
 * at `_insertMaxDelayMs`, and then has up to 100ms of random jitter added so that
 * concurrent retries do not all fire at the same instant. The jitter is added after
 * the cap, so the returned value can exceed `_insertMaxDelayMs` by up to 100ms.
 *
 * @param attempt - 1-based number of the attempt that just failed.
 * @returns The delay in milliseconds (may be fractional).
 */
#calculateConnectionRetryDelay(attempt: number): number {
  const exponentialDelay = this._insertBaseDelayMs * 2 ** (attempt - 1);
  const cappedDelay = Math.min(exponentialDelay, this._insertMaxDelayMs);

  return cappedDelay + Math.random() * 100;
}
569+
459570
async #insertTaskRunInserts(taskRunInserts: TaskRunV2[]) {
460571
return await startSpan(this._tracer, "insertTaskRunsInserts", async (span) => {
461572
const [insertError, insertResult] = await this.options.clickhouse.taskRuns.insert(
@@ -604,6 +715,7 @@ export class RunsReplicationService {
604715
idempotency_key: run.idempotencyKey ?? "",
605716
expiration_ttl: run.ttl ?? "",
606717
output,
718+
concurrency_key: run.concurrencyKey ?? "",
607719
_version: _version.toString(),
608720
_is_deleted: event === "delete" ? 1 : 0,
609721
};
@@ -631,6 +743,14 @@ export class RunsReplicationService {
631743
return { data: undefined };
632744
}
633745

746+
if (detectBadJsonStrings(data)) {
747+
this.logger.warn("Detected bad JSON strings", {
748+
data,
749+
dataType,
750+
});
751+
return { data: undefined };
752+
}
753+
634754
const packet = {
635755
data,
636756
dataType,
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/**
 * Reports whether JSON text contains an unpaired UTF-16 surrogate written as a
 * `\uXXXX` escape.
 *
 * A high surrogate (`\ud800`–`\udbff`) must be followed immediately by a low
 * surrogate escape (`\udc00`–`\udfff`), and a low surrogate must be preceded by a
 * high one. Anything else is reported as bad. Hex digits are matched
 * case-insensitively, and escaped backslashes (`\\u...`, which is literal text
 * rather than a Unicode escape) are skipped.
 *
 * Truncated escapes with fewer than four hex digits are ignored rather than
 * reported.
 *
 * @param jsonString - Raw (still serialized) JSON text to scan.
 * @returns `true` if at least one lone surrogate escape is found, otherwise `false`.
 */
export function detectBadJsonStrings(jsonString: string): boolean {
  // Fast path: without any \u escape there is nothing to inspect.
  if (!jsonString.includes("\\u")) return false;

  const length = jsonString.length;
  let idx = jsonString.indexOf("\\");

  while (idx !== -1 && idx + 1 < length) {
    if (jsonString[idx + 1] !== "u") {
      // Any other escape (including an escaped backslash) is two characters long;
      // stepping over both keeps `\\u` from being misread as a Unicode escape.
      idx = jsonString.indexOf("\\", idx + 2);
      continue;
    }

    const codeUnit = readEscapedCodeUnit(jsonString, idx + 2);

    if (codeUnit === -1) {
      // Not four hex digits: not a complete escape, so nothing to classify.
      idx = jsonString.indexOf("\\", idx + 2);
      continue;
    }

    // A low surrogate reached here was not consumed as the second half of a pair.
    if (codeUnit >= 0xdc00 && codeUnit <= 0xdfff) return true;

    if (codeUnit >= 0xd800 && codeUnit <= 0xdbff) {
      const hasFollowingEscape = jsonString[idx + 6] === "\\" && jsonString[idx + 7] === "u";
      const nextUnit = hasFollowingEscape ? readEscapedCodeUnit(jsonString, idx + 8) : -1;

      if (nextUnit < 0xdc00 || nextUnit > 0xdfff) return true;

      // Valid pair: skip both escapes (2 x 6 characters).
      idx = jsonString.indexOf("\\", idx + 12);
      continue;
    }

    idx = jsonString.indexOf("\\", idx + 6);
  }

  return false;
}

/** Parses four hex digits starting at `start`; returns -1 if they are missing or not hex. */
function readEscapedCodeUnit(text: string, start: number): number {
  if (start + 4 > text.length) return -1;

  let value = 0;
  for (let i = start; i < start + 4; i++) {
    const c = text.charCodeAt(i);
    let digit: number;
    if (c >= 48 && c <= 57) {
      digit = c - 48; // 0-9
    } else if (c >= 97 && c <= 102) {
      digit = c - 87; // a-f
    } else if (c >= 65 && c <= 70) {
      digit = c - 55; // A-F
    } else {
      return -1;
    }
    value = value * 16 + digit;
  }
  return value;
}

apps/webapp/app/v3/runEngine.server.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import { RunEngine } from "@internal/run-engine";
2-
import { defaultMachine } from "~/services/platform.v3.server";
3-
import { prisma } from "~/db.server";
2+
import { $replica, prisma } from "~/db.server";
43
import { env } from "~/env.server";
4+
import { defaultMachine } from "~/services/platform.v3.server";
55
import { singleton } from "~/utils/singleton";
66
import { allMachines } from "./machinePresets.server";
7-
import { tracer, meter } from "./tracer.server";
7+
import { meter, tracer } from "./tracer.server";
88

99
export const engine = singleton("RunEngine", createRunEngine);
1010

@@ -13,6 +13,7 @@ export type { RunEngine };
1313
function createRunEngine() {
1414
const engine = new RunEngine({
1515
prisma,
16+
readOnlyPrisma: $replica,
1617
logLevel: env.RUN_ENGINE_WORKER_LOG_LEVEL,
1718
worker: {
1819
disabled: env.RUN_ENGINE_WORKER_ENABLED === "0",
@@ -39,6 +40,7 @@ function createRunEngine() {
3940
},
4041
queue: {
4142
defaultEnvConcurrency: env.DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT,
43+
logLevel: env.RUN_ENGINE_RUN_QUEUE_LOG_LEVEL,
4244
redis: {
4345
keyPrefix: "engine:",
4446
port: env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? undefined,
@@ -64,6 +66,12 @@ function createRunEngine() {
6466
dequeueBlockingTimeoutSeconds: env.RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS,
6567
masterQueueConsumersIntervalMs: env.RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS,
6668
masterQueueConsumersDisabled: env.RUN_ENGINE_WORKER_ENABLED === "0",
69+
concurrencySweeper: {
70+
scanSchedule: env.RUN_ENGINE_CONCURRENCY_SWEEPER_SCAN_SCHEDULE,
71+
processMarkedSchedule: env.RUN_ENGINE_CONCURRENCY_SWEEPER_PROCESS_MARKED_SCHEDULE,
72+
scanJitterInMs: env.RUN_ENGINE_CONCURRENCY_SWEEPER_SCAN_JITTER_IN_MS,
73+
processMarkedJitterInMs: env.RUN_ENGINE_CONCURRENCY_SWEEPER_PROCESS_MARKED_JITTER_IN_MS,
74+
},
6775
},
6876
runLock: {
6977
redis: {
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"title": "❜ 𝐒 𝐏𝗈𝗌𝗍 . . . 𝐍𝖾𝗐 𝐂𝗈𝗇𝗍𝖾𝗇𝗍 ꒰ ⚔️ ꒱ 𝐒𝐋 ❜ 𝐔𝐋\n\n꒰ ❤️ ꒱ 𓃊 𝐋𝗲𝗮𝘃𝗲 𝖺 𝗹𝗶𝗸𝗲 𝖺𝗇\ud835"
3+
}

0 commit comments

Comments
 (0)