From ae4869701848b63fc08c470f884a009124d32b19 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Wed, 6 May 2026 13:38:16 +0100 Subject: [PATCH 1/5] fix(cli): fail attempt on uncaught exception instead of hanging to maxDuration (TRI-9117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a Node EventEmitter (e.g. node-redis) emits an "error" event with no listener attached, Node escalates it to process.on("uncaughtException") in the task worker. The worker reported the error via the UNCAUGHT_EXCEPTION IPC event but did not exit, and the supervisor-side handler in taskRunProcess only logged the message at debug level — leaving the run() promise orphaned until maxDuration fired and producing empty attempts (durationMs=0, costInCents=0). The supervisor now rejects the in-flight attempt with an UncaughtExceptionError and gracefully terminates the worker (preserving the OTEL flush window) on UNCAUGHT_EXCEPTION. The attempt fails fast with TASK_EXECUTION_FAILED, surfacing the original error name, message, and stack trace, and falls under the normal retry policy. This mirrors the existing indexing-side behavior in indexWorkerManifest. Apply the same handling to unhandled promise rejections, which Node already routes through uncaughtException by default. --- .changeset/uncaught-exception-fail-attempt.md | 9 +++++ .../src/executions/taskRunProcess.test.ts | 35 ++++++++++++++++- .../cli-v3/src/executions/taskRunProcess.ts | 39 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 .changeset/uncaught-exception-fail-attempt.md diff --git a/.changeset/uncaught-exception-fail-attempt.md b/.changeset/uncaught-exception-fail-attempt.md new file mode 100644 index 00000000000..5cf1f3c58bb --- /dev/null +++ b/.changeset/uncaught-exception-fail-attempt.md @@ -0,0 +1,9 @@ +--- +"trigger.dev": patch +--- + +Fix runs hanging to `MAX_DURATION_EXCEEDED` after an uncaught exception. When a Node `EventEmitter` (e.g. 
`node-redis`) emits an `"error"` event with no listener attached, Node escalates it to `process.on("uncaughtException")` in the task worker. The worker reported the error via the `UNCAUGHT_EXCEPTION` IPC event but did not exit, and the supervisor-side handler in `taskRunProcess` only logged the message at debug level — leaving the `run()` promise orphaned until `maxDuration` fired and producing empty attempts (`durationMs=0`, `costInCents=0`). + +The supervisor now rejects the in-flight attempt with an `UncaughtExceptionError` and gracefully terminates the worker (preserving the OTEL flush window) on `UNCAUGHT_EXCEPTION`. The attempt fails fast with `TASK_EXECUTION_FAILED`, surfacing the original error name, message, and stack trace, and falls under the normal retry policy. This mirrors the existing indexing-side behavior. Apply the same handling to unhandled promise rejections, which Node already routes through `uncaughtException` by default. + +Customers should still attach `client.on("error", ...)` listeners to long-lived clients (Redis, Postgres, etc.) and let awaited command rejections drive failure semantics — but a missed listener will no longer silently consume the entire `maxDuration` budget. 
diff --git a/packages/cli-v3/src/executions/taskRunProcess.test.ts b/packages/cli-v3/src/executions/taskRunProcess.test.ts index 82ab19639b2..cfcd19516e9 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.test.ts +++ b/packages/cli-v3/src/executions/taskRunProcess.test.ts @@ -1,6 +1,6 @@ import { TaskRunProcess, type TaskRunProcessOptions } from "./taskRunProcess.js"; import { describe, it, expect, vi } from "vitest"; -import { UnexpectedExitError } from "@trigger.dev/core/v3/errors"; +import { UncaughtExceptionError, UnexpectedExitError } from "@trigger.dev/core/v3/errors"; import type { TaskRunExecution, TaskRunExecutionPayload, @@ -118,4 +118,37 @@ describe("TaskRunProcess", () => { } }); }); + + describe("parseExecuteError(UncaughtExceptionError)", () => { + it("surfaces the original error name/message/stack as TASK_EXECUTION_FAILED", () => { + const error = new UncaughtExceptionError( + { + name: "Error", + message: "read ECONNRESET", + stack: + "Error: read ECONNRESET\n at TCP.onStreamRead (node:internal/stream_base_commons:216:20)", + }, + "uncaughtException" + ); + + const result = TaskRunProcess.parseExecuteError(error); + + expect(result.type).toBe("INTERNAL_ERROR"); + expect(result.code).toBe("TASK_EXECUTION_FAILED"); + expect(result.message).toBe("Uncaught uncaughtException: read ECONNRESET"); + expect(result.stackTrace).toContain("TCP.onStreamRead"); + }); + + it("preserves origin=unhandledRejection in the surfaced message", () => { + const error = new UncaughtExceptionError( + { name: "Error", message: "boom" }, + "unhandledRejection" + ); + + const result = TaskRunProcess.parseExecuteError(error); + + expect(result.code).toBe("TASK_EXECUTION_FAILED"); + expect(result.message).toBe("Uncaught unhandledRejection: boom"); + }); + }); }); diff --git a/packages/cli-v3/src/executions/taskRunProcess.ts b/packages/cli-v3/src/executions/taskRunProcess.ts index a329956c0d2..2014fd6db8c 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.ts +++ 
b/packages/cli-v3/src/executions/taskRunProcess.ts @@ -33,6 +33,7 @@ import { MaxDurationExceededError, UnexpectedExitError, SuspendedProcessError, + UncaughtExceptionError, } from "@trigger.dev/core/v3/errors"; export type OnSendDebugLogMessage = InferSocketMessageSchema< @@ -205,6 +206,18 @@ export class TaskRunProcess { }, UNCAUGHT_EXCEPTION: async (message) => { logger.debug("uncaught exception in task run process", { ...message }); + + // The worker process reports uncaught exceptions and unhandled rejections via this + // event, but does not exit on its own. If we don't terminate the attempt here, run() + // hangs (the awaited promise that triggered the throw is orphaned) until maxDuration + // expires — surfacing as TIMED_OUT/MAX_DURATION_EXCEEDED with empty attempts. Reject + // any pending attempts now and gracefully terminate the worker so OTEL gets a flush + // window before SIGKILL. + this.#rejectPendingAttempts( + new UncaughtExceptionError(message.error, message.origin) + ); + + await this.#gracefullyTerminate(this.options.gracefulTerminationTimeoutInMs); }, SEND_DEBUG_LOG: async (message) => { this.onSendDebugLog.post(message); @@ -339,6 +352,23 @@ export class TaskRunProcess { logger.debug("child process error", { error, pid: this.pid }); } + #rejectPendingAttempts(error: Error) { + for (const [id, status] of this._attemptStatuses.entries()) { + if (status !== "PENDING") { + continue; + } + + this._attemptStatuses.set(id, "REJECTED"); + + const attemptPromise = this._attemptPromises.get(id); + if (!attemptPromise) { + continue; + } + + attemptPromise.rejecter(error); + } + } + async #handleExit(code: number | null, signal: NodeJS.Signals | null) { logger.debug("handling child exit", { code, signal, pid: this.pid }); @@ -559,6 +589,15 @@ export class TaskRunProcess { }; } + if (error instanceof UncaughtExceptionError) { + return { + type: "INTERNAL_ERROR", + code: TaskRunErrorCodes.TASK_EXECUTION_FAILED, + message: `Uncaught ${error.origin}: 
${error.originalError.message}`, + stackTrace: error.originalError.stack, + }; + } + return { type: "INTERNAL_ERROR", code: TaskRunErrorCodes.TASK_EXECUTION_FAILED, From 413fa078a917cebf27e135876c8fa6e92a1f6520 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Wed, 6 May 2026 14:42:48 +0100 Subject: [PATCH 2/5] fix(cli): surface uncaught exception as user error (FAILED) not system failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following on from the prior commit that wired UNCAUGHT_EXCEPTION to fail the attempt: the parseExecuteError branch returned an INTERNAL_ERROR with code TASK_EXECUTION_FAILED, which made the run show as "System failure" in the dashboard. The exception was raised by user code (or a dependency the user controls — e.g. an EventEmitter "error" event with no listener), so it should surface as a regular task failure ("Failed" status), not as a platform fault. Widen parseExecuteError's return to TaskRunError and have the UncaughtExceptionError branch return a BUILT_IN_ERROR carrying the original error name, message, and stack. This routes through the same finalization path as a thrown user error: status=FAILED, normal retry policy, catchError / handleError hooks fire as expected. Both call sites (managed/execution.ts, dev-run-controller.ts) already pass the result into TaskRunFailedExecutionResult.error, which accepts the full TaskRunError union — no caller-side changes needed. 
--- .changeset/uncaught-exception-fail-attempt.md | 6 +---- .../src/executions/taskRunProcess.test.ts | 23 +++++++++++-------- .../cli-v3/src/executions/taskRunProcess.ts | 16 +++++++++---- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/.changeset/uncaught-exception-fail-attempt.md b/.changeset/uncaught-exception-fail-attempt.md index 5cf1f3c58bb..2ce80a3fa82 100644 --- a/.changeset/uncaught-exception-fail-attempt.md +++ b/.changeset/uncaught-exception-fail-attempt.md @@ -2,8 +2,4 @@ "trigger.dev": patch --- -Fix runs hanging to `MAX_DURATION_EXCEEDED` after an uncaught exception. When a Node `EventEmitter` (e.g. `node-redis`) emits an `"error"` event with no listener attached, Node escalates it to `process.on("uncaughtException")` in the task worker. The worker reported the error via the `UNCAUGHT_EXCEPTION` IPC event but did not exit, and the supervisor-side handler in `taskRunProcess` only logged the message at debug level — leaving the `run()` promise orphaned until `maxDuration` fired and producing empty attempts (`durationMs=0`, `costInCents=0`). - -The supervisor now rejects the in-flight attempt with an `UncaughtExceptionError` and gracefully terminates the worker (preserving the OTEL flush window) on `UNCAUGHT_EXCEPTION`. The attempt fails fast with `TASK_EXECUTION_FAILED`, surfacing the original error name, message, and stack trace, and falls under the normal retry policy. This mirrors the existing indexing-side behavior. Apply the same handling to unhandled promise rejections, which Node already routes through `uncaughtException` by default. - -Customers should still attach `client.on("error", ...)` listeners to long-lived clients (Redis, Postgres, etc.) and let awaited command rejections drive failure semantics — but a missed listener will no longer silently consume the entire `maxDuration` budget. +Fail attempts on uncaught exceptions instead of hanging to `MAX_DURATION_EXCEEDED`. A Node `EventEmitter` (e.g. 
`node-redis`) emitting `"error"` with no `.on("error", ...)` listener escalates to `uncaughtException`, which the worker previously reported but did not act on — runs drifted to maxDuration with empty attempts. They now fail fast with the original error and status `FAILED`. You should still attach `.on("error", ...)` listeners to long-lived clients to handle errors gracefully. diff --git a/packages/cli-v3/src/executions/taskRunProcess.test.ts b/packages/cli-v3/src/executions/taskRunProcess.test.ts index cfcd19516e9..b510f86cd15 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.test.ts +++ b/packages/cli-v3/src/executions/taskRunProcess.test.ts @@ -120,7 +120,7 @@ describe("TaskRunProcess", () => { }); describe("parseExecuteError(UncaughtExceptionError)", () => { - it("surfaces the original error name/message/stack as TASK_EXECUTION_FAILED", () => { + it("surfaces the original error as a BUILT_IN_ERROR so the run shows as Failed, not System failure", () => { const error = new UncaughtExceptionError( { name: "Error", @@ -133,22 +133,27 @@ describe("TaskRunProcess", () => { const result = TaskRunProcess.parseExecuteError(error); - expect(result.type).toBe("INTERNAL_ERROR"); - expect(result.code).toBe("TASK_EXECUTION_FAILED"); - expect(result.message).toBe("Uncaught uncaughtException: read ECONNRESET"); - expect(result.stackTrace).toContain("TCP.onStreamRead"); + expect(result.type).toBe("BUILT_IN_ERROR"); + if (result.type === "BUILT_IN_ERROR") { + expect(result.name).toBe("Error"); + expect(result.message).toBe("read ECONNRESET"); + expect(result.stackTrace).toContain("TCP.onStreamRead"); + } }); - it("preserves origin=unhandledRejection in the surfaced message", () => { + it("preserves the original error for unhandledRejection origin too", () => { const error = new UncaughtExceptionError( - { name: "Error", message: "boom" }, + { name: "TypeError", message: "boom" }, "unhandledRejection" ); const result = TaskRunProcess.parseExecuteError(error); - 
expect(result.code).toBe("TASK_EXECUTION_FAILED"); - expect(result.message).toBe("Uncaught unhandledRejection: boom"); + expect(result.type).toBe("BUILT_IN_ERROR"); + if (result.type === "BUILT_IN_ERROR") { + expect(result.name).toBe("TypeError"); + expect(result.message).toBe("boom"); + } }); }); }); diff --git a/packages/cli-v3/src/executions/taskRunProcess.ts b/packages/cli-v3/src/executions/taskRunProcess.ts index 2014fd6db8c..a2317cbb021 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.ts +++ b/packages/cli-v3/src/executions/taskRunProcess.ts @@ -8,6 +8,7 @@ import { TaskRunExecution, TaskRunExecutionPayload, TaskRunExecutionResult, + type TaskRunError, type TaskRunInternalError, tryCatch, WorkerManifest, @@ -555,7 +556,7 @@ export class TaskRunProcess { return this._child.connected; } - static parseExecuteError(error: unknown, dockerMode = true): TaskRunInternalError { + static parseExecuteError(error: unknown, dockerMode = true): TaskRunError { if (error instanceof CancelledProcessError) { return { type: "INTERNAL_ERROR", @@ -590,11 +591,16 @@ export class TaskRunProcess { } if (error instanceof UncaughtExceptionError) { + // Surface the customer's original error as a regular task failure (user + // error → "Failed" status) rather than an internal error → "System + // failure" status. The exception was raised by user code (or a + // dependency the user controls, e.g. an EventEmitter "error" event with + // no listener); it isn't a platform fault. return { - type: "INTERNAL_ERROR", - code: TaskRunErrorCodes.TASK_EXECUTION_FAILED, - message: `Uncaught ${error.origin}: ${error.originalError.message}`, - stackTrace: error.originalError.stack, + type: "BUILT_IN_ERROR", + name: error.originalError.name, + message: error.originalError.message, + stackTrace: error.originalError.stack ?? 
"", }; } From 3f70ec1d2f1de9cffc391f1022528688c3d5de87 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Wed, 6 May 2026 15:31:13 +0100 Subject: [PATCH 3/5] fix(core, cli, run-engine): route uncaught exceptions through new INTERNAL_ERROR code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce TASK_RUN_UNCAUGHT_EXCEPTION as a dedicated TaskRunInternalError code so the engine handles retry through its existing crash-style pathway (lockedRetryConfig lookup), and the dashboard renders these failures as "Failed" rather than "System failure". The previous BUILT_IN_ERROR approach showed the right status but didn't respect the user's retry policy: BUILT_IN_ERROR with retry: undefined short-circuits to fail_run because shouldLookupRetrySettings(BUILT_IN_ERROR) returns false. Inline retry calculation in cli-v3 was rejected as duplicating logic already owned by the engine. This change mirrors how TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE, TASK_PROCESS_SIGTERM, and TASK_PROCESS_SIGSEGV already work — same lookup-and-retry pathway, just with a different surface status (Failed vs Crashed) and the original error's message + stackTrace carried on the INTERNAL_ERROR payload. No global behaviour changes; the new code is opt-in via parseExecuteError's UncaughtExceptionError branch. 
Touchpoints: - packages/core/src/v3/schemas/common.ts: enum entry - packages/core/src/v3/errors.ts: shouldRetryError, shouldLookupRetrySettings - internal-packages/run-engine/src/engine/errors.ts: runStatusFromError - packages/cli-v3/src/executions/taskRunProcess.ts: parseExecuteError + revert TaskRunError widening - tests + changeset + server-changes entry --- .changeset/uncaught-exception-fail-attempt.md | 3 ++- .../uncaught-exception-status-mapping.md | 12 ++++++++++ .../run-engine/src/engine/errors.ts | 1 + .../src/executions/taskRunProcess.test.ts | 22 ++++++++----------- .../cli-v3/src/executions/taskRunProcess.ts | 20 ++++++++--------- packages/core/src/v3/errors.ts | 2 ++ packages/core/src/v3/schemas/common.ts | 1 + 7 files changed, 37 insertions(+), 24 deletions(-) create mode 100644 .server-changes/uncaught-exception-status-mapping.md diff --git a/.changeset/uncaught-exception-fail-attempt.md b/.changeset/uncaught-exception-fail-attempt.md index 2ce80a3fa82..d80c09c825e 100644 --- a/.changeset/uncaught-exception-fail-attempt.md +++ b/.changeset/uncaught-exception-fail-attempt.md @@ -1,5 +1,6 @@ --- "trigger.dev": patch +"@trigger.dev/core": patch --- -Fail attempts on uncaught exceptions instead of hanging to `MAX_DURATION_EXCEEDED`. A Node `EventEmitter` (e.g. `node-redis`) emitting `"error"` with no `.on("error", ...)` listener escalates to `uncaughtException`, which the worker previously reported but did not act on — runs drifted to maxDuration with empty attempts. They now fail fast with the original error and status `FAILED`. You should still attach `.on("error", ...)` listeners to long-lived clients to handle errors gracefully. +Fail attempts on uncaught exceptions instead of hanging to `MAX_DURATION_EXCEEDED`. A Node `EventEmitter` (e.g. `node-redis`) emitting `"error"` with no `.on("error", ...)` listener escalates to `uncaughtException`, which the worker previously reported but did not act on — runs drifted to maxDuration with empty attempts. 
They now fail fast with the original error and status `FAILED`, and respect the task's normal retry policy. You should still attach `.on("error", ...)` listeners to long-lived clients to handle errors gracefully. diff --git a/.server-changes/uncaught-exception-status-mapping.md b/.server-changes/uncaught-exception-status-mapping.md new file mode 100644 index 00000000000..941342359fb --- /dev/null +++ b/.server-changes/uncaught-exception-status-mapping.md @@ -0,0 +1,12 @@ +--- +area: run-engine +type: fix +--- + +Map the new `TASK_RUN_UNCAUGHT_EXCEPTION` internal-error code to +`COMPLETED_WITH_ERRORS` (Failed) status in `runStatusFromError`. cli-v3 +now emits this code when the worker process surfaces an uncaught +exception (e.g. a Node EventEmitter emitting `"error"` with no listener), +so the run renders as a regular task failure in the dashboard rather +than a system failure, while still routing through the engine's +`lockedRetryConfig` lookup so the user's retry policy is honoured. diff --git a/internal-packages/run-engine/src/engine/errors.ts b/internal-packages/run-engine/src/engine/errors.ts index 820f0ec4ce6..9a41cba11ee 100644 --- a/internal-packages/run-engine/src/engine/errors.ts +++ b/internal-packages/run-engine/src/engine/errors.ts @@ -19,6 +19,7 @@ export function runStatusFromError( case "TASK_INPUT_ERROR": case "TASK_OUTPUT_ERROR": case "TASK_MIDDLEWARE_ERROR": + case "TASK_RUN_UNCAUGHT_EXCEPTION": return "COMPLETED_WITH_ERRORS"; case "TASK_RUN_CANCELLED": return "CANCELED"; diff --git a/packages/cli-v3/src/executions/taskRunProcess.test.ts b/packages/cli-v3/src/executions/taskRunProcess.test.ts index b510f86cd15..9f36ac13b34 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.test.ts +++ b/packages/cli-v3/src/executions/taskRunProcess.test.ts @@ -120,7 +120,7 @@ describe("TaskRunProcess", () => { }); describe("parseExecuteError(UncaughtExceptionError)", () => { - it("surfaces the original error as a BUILT_IN_ERROR so the run shows as Failed, 
not System failure", () => { + it("returns INTERNAL_ERROR with TASK_RUN_UNCAUGHT_EXCEPTION + original message and stack", () => { const error = new UncaughtExceptionError( { name: "Error", @@ -133,15 +133,13 @@ describe("TaskRunProcess", () => { const result = TaskRunProcess.parseExecuteError(error); - expect(result.type).toBe("BUILT_IN_ERROR"); - if (result.type === "BUILT_IN_ERROR") { - expect(result.name).toBe("Error"); - expect(result.message).toBe("read ECONNRESET"); - expect(result.stackTrace).toContain("TCP.onStreamRead"); - } + expect(result.type).toBe("INTERNAL_ERROR"); + expect(result.code).toBe("TASK_RUN_UNCAUGHT_EXCEPTION"); + expect(result.message).toBe("read ECONNRESET"); + expect(result.stackTrace).toContain("TCP.onStreamRead"); }); - it("preserves the original error for unhandledRejection origin too", () => { + it("uses the same code for unhandledRejection origin", () => { const error = new UncaughtExceptionError( { name: "TypeError", message: "boom" }, "unhandledRejection" @@ -149,11 +147,9 @@ describe("TaskRunProcess", () => { const result = TaskRunProcess.parseExecuteError(error); - expect(result.type).toBe("BUILT_IN_ERROR"); - if (result.type === "BUILT_IN_ERROR") { - expect(result.name).toBe("TypeError"); - expect(result.message).toBe("boom"); - } + expect(result.type).toBe("INTERNAL_ERROR"); + expect(result.code).toBe("TASK_RUN_UNCAUGHT_EXCEPTION"); + expect(result.message).toBe("boom"); }); }); }); diff --git a/packages/cli-v3/src/executions/taskRunProcess.ts b/packages/cli-v3/src/executions/taskRunProcess.ts index a2317cbb021..6816b2e24f2 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.ts +++ b/packages/cli-v3/src/executions/taskRunProcess.ts @@ -8,7 +8,6 @@ import { TaskRunExecution, TaskRunExecutionPayload, TaskRunExecutionResult, - type TaskRunError, type TaskRunInternalError, tryCatch, WorkerManifest, @@ -556,7 +555,7 @@ export class TaskRunProcess { return this._child.connected; } - static parseExecuteError(error: unknown, 
dockerMode = true): TaskRunError { + static parseExecuteError(error: unknown, dockerMode = true): TaskRunInternalError { if (error instanceof CancelledProcessError) { return { type: "INTERNAL_ERROR", @@ -591,16 +590,17 @@ export class TaskRunProcess { } if (error instanceof UncaughtExceptionError) { - // Surface the customer's original error as a regular task failure (user - // error → "Failed" status) rather than an internal error → "System - // failure" status. The exception was raised by user code (or a - // dependency the user controls, e.g. an EventEmitter "error" event with - // no listener); it isn't a platform fault. + // Dedicated INTERNAL_ERROR code so the engine handles retry via the + // existing crash-style lookup of run.lockedRetryConfig (same pathway as + // TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE etc.) and so the dashboard + // renders this as "Failed" rather than "System failure" — the exception + // was raised by user code (or a dependency the user controls, e.g. an + // EventEmitter "error" event with no listener), not a platform fault. return { - type: "BUILT_IN_ERROR", - name: error.originalError.name, + type: "INTERNAL_ERROR", + code: TaskRunErrorCodes.TASK_RUN_UNCAUGHT_EXCEPTION, message: error.originalError.message, - stackTrace: error.originalError.stack ?? 
"", + stackTrace: error.originalError.stack, }; } diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index 802f53c5441..1c0a644b701 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -395,6 +395,7 @@ export function shouldRetryError(error: TaskRunError): boolean { case "TASK_EXECUTION_ABORTED": case "TASK_EXECUTION_FAILED": case "TASK_RUN_CRASHED": + case "TASK_RUN_UNCAUGHT_EXCEPTION": case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE": case "TASK_PROCESS_SIGTERM": return true; @@ -425,6 +426,7 @@ export function shouldLookupRetrySettings(error: TaskRunError): boolean { case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE": case "TASK_PROCESS_SIGTERM": case "TASK_PROCESS_SIGSEGV": + case "TASK_RUN_UNCAUGHT_EXCEPTION": return true; default: diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts index 8bd22dd4bbb..4de2ddb5802 100644 --- a/packages/core/src/v3/schemas/common.ts +++ b/packages/core/src/v3/schemas/common.ts @@ -174,6 +174,7 @@ export const TaskRunInternalError = z.object({ "GRACEFUL_EXIT_TIMEOUT", "TASK_RUN_HEARTBEAT_TIMEOUT", "TASK_RUN_CRASHED", + "TASK_RUN_UNCAUGHT_EXCEPTION", "MAX_DURATION_EXCEEDED", "DISK_SPACE_EXCEEDED", "POD_EVICTED", From c672d3308e478eea2598477e3faf3b314fea4048 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Wed, 6 May 2026 16:08:28 +0100 Subject: [PATCH 4/5] fix(core): pretty link for TASK_RUN_UNCAUGHT_EXCEPTION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a prettyInternalErrors entry pointing the dashboard at a troubleshooting doc anchor for uncaught exceptions. Link-only — no `message` override, so the customer's original error (e.g. "read ECONNRESET") is preserved as the main text. The link gives them somewhere to read about attaching .on("error") listeners and the unhandledRejection pathway. 
The docs anchor (#uncaught-exceptions) doesn't exist yet — needs a docs PR to add the troubleshooting section. --- packages/core/src/v3/errors.ts | 12 ++++++++++++ packages/core/src/v3/links.ts | 1 + 2 files changed, 13 insertions(+) diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index 1c0a644b701..a538ca9357b 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -724,6 +724,18 @@ const prettyInternalErrors: Partial< href: links.docs.troubleshooting.stalledExecution, }, }, + // Link only — we deliberately do NOT set `message`, so the original + // error message (e.g. "read ECONNRESET") is preserved in the dashboard. + // Common cause: an EventEmitter (node-redis, pg, etc.) emitted "error" + // with no listener attached, which Node escalates to uncaughtException. + // The docs page explains how to attach .on("error") listeners and how + // unhandled rejections route through the same path. + TASK_RUN_UNCAUGHT_EXCEPTION: { + link: { + name: "Read our troubleshooting guide", + href: links.docs.troubleshooting.uncaughtException, + }, + }, }; const getPrettyTaskRunError = (code: TaskRunInternalError["code"]): TaskRunInternalError => { diff --git a/packages/core/src/v3/links.ts b/packages/core/src/v3/links.ts index d04284e73fe..739f9dd28f7 100644 --- a/packages/core/src/v3/links.ts +++ b/packages/core/src/v3/links.ts @@ -15,6 +15,7 @@ export const links = { troubleshooting: { concurrentWaits: "https://trigger.dev/docs/troubleshooting#parallel-waits-are-not-supported", stalledExecution: "https://trigger.dev/docs/troubleshooting#task-run-stalled-executing", + uncaughtException: "https://trigger.dev/docs/troubleshooting#uncaught-exceptions", }, concurrency: { recursiveDeadlock: From a5837fc7e6e158cc2f35f66dae359a523d1fb48f Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Wed, 6 May 2026 16:12:02 +0100 Subject: [PATCH 5/5] docs(troubleshooting): explain TASK_RUN_UNCAUGHT_EXCEPTION Adds an "Uncaught exceptions" 
section that the dashboard's pretty-link button now points at (#uncaught-exceptions). Covers what the error means, the common EventEmitter-without-listener cause (with a node-redis example), the .on("error", ...) fix, and the unhandledRejection path. --- docs/troubleshooting.mdx | 49 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/docs/troubleshooting.mdx b/docs/troubleshooting.mdx index 4bac1ba5b5c..71784c30edc 100644 --- a/docs/troubleshooting.mdx +++ b/docs/troubleshooting.mdx @@ -278,6 +278,55 @@ You could also offload the CPU-heavy work to a Node.js worker thread, but this i If the above doesn't work, then we recommend you try increasing the machine size of your task. See our [machines guide](/machines) for more information. +### Uncaught exceptions + +If you see a `TASK_RUN_UNCAUGHT_EXCEPTION` error, an exception escaped your task's `run()` function without being thrown through your `await` chain — the runtime caught it via Node's `process.on("uncaughtException")` handler. The dashboard surfaces this as a regular task failure (status `Failed`) and the run will retry according to your task's retry policy, but the exception still indicates a bug worth fixing. + +The most common cause is a Node `EventEmitter` emitting an `"error"` event with no listener attached. When this happens, Node escalates the event into an `uncaughtException`. Long-lived clients like `node-redis`, `pg`, `kafkajs`, and `mongodb` all surface socket-level errors this way. + +For example, a `node-redis` client with no error listener will fail your run with an `Error: read ECONNRESET` (or similar TCP error) the next time the socket is reset: + +```ts +import { task } from "@trigger.dev/sdk"; +import { createClient } from "redis"; + +export const myTask = task({ + id: "my-task", + run: async () => { + const client = createClient({ url: process.env.REDIS_URL }); + + // BAD: no .on("error", ...) 
listener — a socket reset will crash the run + // with an uncaught exception, even if the next .get() would have worked. + await client.connect(); + return await client.get("foo"); + }, +}); +``` + +Fix it by attaching an `error` listener so the event has somewhere to go: + +```ts +const client = createClient({ url: process.env.REDIS_URL }); + +// GOOD: the listener catches socket-level errors. The awaited command +// (e.g. .get) will still reject if the connection is broken, and that +// rejection propagates through your run() and fails the attempt cleanly. +client.on("error", (err) => { + logger.warn("Redis client error", { err }); +}); + +await client.connect(); +return await client.get("foo"); +``` + +The same fix applies to any library that emits `"error"` events. As a rule, attach an `.on("error", ...)` listener to every long-lived client you create inside a task. + +<Note> + +Unhandled promise rejections (e.g. `Promise.reject(...)` with no `.catch`) take the same path — Node routes them through `uncaughtException` by default, and the runtime treats them as `TASK_RUN_UNCAUGHT_EXCEPTION` for the same reasons. Make sure every promise either gets `await`ed or has a `.catch(...)` handler. + +</Note> + ### Realtime stream error (`sendBatchNonBlocking` / `S2AppendSession`) Errors mentioning `sendBatchNonBlocking`, `@s2-dev/streamstore`, or `S2AppendSession` (often with `code: undefined`) can occur when you close a stream and then await `waitUntilComplete()`, or when a stream runs for a long time (e.g. 20+ minutes). Wrap `waitUntilComplete()` in try/catch so Transport/closed-stream errors don't fail your task: