From 66740e77a50f6b8df111bfb7b49ec109e1a38ebf Mon Sep 17 00:00:00 2001 From: DC Date: Thu, 15 Jan 2026 22:03:44 -0500 Subject: [PATCH 1/3] fix(background-agent): prevent false failure notifications on network errors When API providers are under heavy load (e.g., Antigravity during the Anthropic OAuth migration), session.prompt() may throw network errors (timeouts, connection drops) even when the background session is still running successfully. Previously, ALL errors from session.prompt() would immediately mark the task as 'error' and notify the parent agent of failure. This caused agents to falsely believe their background tasks failed, triggering unnecessary fallback behavior ('Background agents failed. Pivoting to direct parallel searches'). Now, only fatal errors (agent not found, not registered, etc.) are treated as task failures. Non-fatal errors are logged and the task continues - polling will detect actual completion/failure. Fixes false-positive failure notifications during high API load. --- src/features/background-agent/manager.ts | 31 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/features/background-agent/manager.ts b/src/features/background-agent/manager.ts index 9729acccc7..bb27a60365 100644 --- a/src/features/background-agent/manager.ts +++ b/src/features/background-agent/manager.ts @@ -186,16 +186,33 @@ export class BackgroundManager { parts: [{ type: "text", text: input.prompt }], }, }).catch((error) => { - log("[background-agent] promptAsync error:", error) + const errorMessage = error instanceof Error ? error.message : String(error) + log("[background-agent] prompt error:", { + sessionID, + agent: input.agent, + errorName: error?.name, + errorMessage, + }) + + // Only treat as fatal if it's clearly an agent/session creation issue. + // Network timeouts, connection drops, and SDK issues should NOT mark task as failed + // since the session may still be running successfully (common during high API load). + const isFatalError = + errorMessage.includes("agent.name") || + errorMessage.includes("undefined") || + errorMessage.includes("not found") || + errorMessage.includes("not registered") || + errorMessage.includes("does not exist") + + if (!isFatalError) { + log("[background-agent] Non-fatal prompt error, letting polling detect actual status:", sessionID) + return + } + const existingTask = this.findBySession(sessionID) if (existingTask) { existingTask.status = "error" - const errorMessage = error instanceof Error ? error.message : String(error) - if (errorMessage.includes("agent.name") || errorMessage.includes("undefined")) { - existingTask.error = `Agent "${input.agent}" not found. Make sure the agent is registered in your opencode.json or provided by a plugin.` - } else { - existingTask.error = errorMessage - } + existingTask.error = `Agent "${input.agent}" not found. Make sure the agent is registered in your opencode.json or provided by a plugin.` existingTask.completedAt = new Date() if (existingTask.concurrencyKey) { this.concurrencyManager.release(existingTask.concurrencyKey) From e243dd440deaa425e01d0b0698e86a556adae2bc Mon Sep 17 00:00:00 2001 From: David Carroll Date: Fri, 16 Jan 2026 10:49:31 -0500 Subject: [PATCH 2/3] fix(background-agent): invert error handling - fatal by default, transient only Address PR #837 feedback from cubic AI and codex: - INVERTED LOGIC: Treat errors as fatal by default instead of only catching specific 'not found' patterns. This fixes the P1 issue where auth/permission/payload errors were swallowed, leaving tasks stuck until TTL cleanup. - Only skip notification for known TRANSIENT network errors: - TimeoutError, AbortError, FetchError (error names) - ETIMEDOUT, ECONNRESET, ECONNREFUSED, ENOTFOUND (Node.js errors) - EAI_AGAIN, EPIPE, ENETUNREACH, EHOSTUNREACH (additional network) - socket hang up, network request failed, connection/request timed out - Empty error objects ({} or '') from SDK issues - Fixed resume() to also release concurrency on fatal errors (was missing) - Stricter agent-not-found detection: requires 'agent' AND ('not found' OR 'not registered' OR 'does not exist') - prevents false positives from generic 'undefined' errors - Removed overly broad patterns ('network', 'timeout') that could match non-transient errors like 'invalid timeout value' Ultraworked with Sisyphus Co-authored-by: Sisyphus --- src/features/background-agent/manager.ts | 92 +++++++++++++++++++----- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/src/features/background-agent/manager.ts b/src/features/background-agent/manager.ts index bb27a60365..99eb182635 100644 --- a/src/features/background-agent/manager.ts +++ b/src/features/background-agent/manager.ts @@ -187,32 +187,61 @@ export class BackgroundManager { }, }).catch((error) => { const errorMessage = error instanceof Error ? error.message : String(error) + const errorName = error?.name ?? "" log("[background-agent] prompt error:", { sessionID, agent: input.agent, - errorName: error?.name, + errorName, errorMessage, }) - // Only treat as fatal if it's clearly an agent/session creation issue. - // Network timeouts, connection drops, and SDK issues should NOT mark task as failed - // since the session may still be running successfully (common during high API load). - const isFatalError = - errorMessage.includes("agent.name") || - errorMessage.includes("undefined") || - errorMessage.includes("not found") || - errorMessage.includes("not registered") || - errorMessage.includes("does not exist") + // INVERTED LOGIC: Treat errors as fatal by default. + // Only skip notification for known TRANSIENT network errors where + // the session may still be running successfully (common during high API load). + // + // Transient errors: network timeouts, connection drops, empty SDK errors + // These indicate the request failed to send, but the session may have started. + const isTransientNetworkError = + errorName === "TimeoutError" || + errorName === "AbortError" || + errorName === "FetchError" || + errorMessage.includes("ETIMEDOUT") || + errorMessage.includes("ECONNRESET") || + errorMessage.includes("ECONNREFUSED") || + errorMessage.includes("ENOTFOUND") || + errorMessage.includes("EAI_AGAIN") || + errorMessage.includes("EPIPE") || + errorMessage.includes("ENETUNREACH") || + errorMessage.includes("EHOSTUNREACH") || + errorMessage.includes("socket hang up") || + errorMessage.includes("network request failed") || + errorMessage.includes("connection timed out") || + errorMessage.includes("request timed out") || + // Empty error object {} from SDK issues - session likely still running + (errorMessage === "{}" || errorMessage === "") - if (!isFatalError) { - log("[background-agent] Non-fatal prompt error, letting polling detect actual status:", sessionID) + if (isTransientNetworkError) { + log("[background-agent] Transient network error, letting polling detect actual status:", sessionID) return } + // All other errors are fatal: auth, permission, invalid payload, agent not found, etc. + // Mark task as failed immediately to release concurrency and notify parent. const existingTask = this.findBySession(sessionID) if (existingTask) { existingTask.status = "error" - existingTask.error = `Agent "${input.agent}" not found. Make sure the agent is registered in your opencode.json or provided by a plugin.` + // Provide specific error message for agent-related issues (stricter matching) + const isAgentNotFoundError = + errorMessage.includes("agent") && ( + errorMessage.includes("not found") || + errorMessage.includes("not registered") || + errorMessage.includes("does not exist") + ) + if (isAgentNotFoundError) { + existingTask.error = `Agent "${input.agent}" not found. Make sure the agent is registered in your opencode.json or provided by a plugin.` + } else { + existingTask.error = `Background task failed: ${errorMessage}` + } existingTask.completedAt = new Date() if (existingTask.concurrencyKey) { this.concurrencyManager.release(existingTask.concurrencyKey) @@ -432,10 +461,41 @@ export class BackgroundManager { parts: [{ type: "text", text: input.prompt }], }, }).catch((error) => { - log("[background-agent] resume prompt error:", error) - existingTask.status = "error" const errorMessage = error instanceof Error ? error.message : String(error) - existingTask.error = errorMessage + const errorName = error?.name ?? "" + log("[background-agent] resume prompt error:", { + sessionID: existingTask.sessionID, + agent: existingTask.agent, + errorName, + errorMessage, + }) + + // Same transient network error handling as launch() + const isTransientNetworkError = + errorName === "TimeoutError" || + errorName === "AbortError" || + errorName === "FetchError" || + errorMessage.includes("ETIMEDOUT") || + errorMessage.includes("ECONNRESET") || + errorMessage.includes("ECONNREFUSED") || + errorMessage.includes("ENOTFOUND") || + errorMessage.includes("EAI_AGAIN") || + errorMessage.includes("EPIPE") || + errorMessage.includes("ENETUNREACH") || + errorMessage.includes("EHOSTUNREACH") || + errorMessage.includes("socket hang up") || + errorMessage.includes("network request failed") || + errorMessage.includes("connection timed out") || + errorMessage.includes("request timed out") || + (errorMessage === "{}" || errorMessage === "") + + if (isTransientNetworkError) { + log("[background-agent] Transient network error on resume, letting polling detect actual status:", existingTask.sessionID) + return + } + + existingTask.status = "error" + existingTask.error = `Resume failed: ${errorMessage}` existingTask.completedAt = new Date() // Release concurrency on error to prevent slot leaks From 86c1366ffd9438d2c6c93e71f44aa36285a959e0 Mon Sep 17 00:00:00 2001 From: David Carroll Date: Sat, 17 Jan 2026 00:58:28 -0500 Subject: [PATCH 3/3] fix(background-agent): handle String({}) returning [object Object] String(emptyErrorObject) returns '[object Object]', not '{}', so the transient error check was missing empty SDK error objects. This caused false failure notifications when the session was actually still running. Applied fix to both launch() and resume() error handlers. --- src/features/background-agent/manager.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/features/background-agent/manager.ts b/src/features/background-agent/manager.ts index 99eb182635..645691471c 100644 --- a/src/features/background-agent/manager.ts +++ b/src/features/background-agent/manager.ts @@ -218,7 +218,7 @@ export class BackgroundManager { errorMessage.includes("connection timed out") || errorMessage.includes("request timed out") || // Empty error object {} from SDK issues - session likely still running - (errorMessage === "{}" || errorMessage === "") + errorMessage === "{}" || errorMessage === "" || errorMessage === "[object Object]" if (isTransientNetworkError) { log("[background-agent] Transient network error, letting polling detect actual status:", sessionID) @@ -487,7 +487,7 @@ export class BackgroundManager { errorMessage.includes("network request failed") || errorMessage.includes("connection timed out") || errorMessage.includes("request timed out") || - (errorMessage === "{}" || errorMessage === "") + errorMessage === "{}" || errorMessage === "" || errorMessage === "[object Object]" if (isTransientNetworkError) { log("[background-agent] Transient network error on resume, letting polling detect actual status:", existingTask.sessionID)