From d6831c2e83eef48be9518261f03982632fc54029 Mon Sep 17 00:00:00 2001
From: limit_yan <limit_yan@sina.com>
Date: Sat, 13 Jun 2026 18:31:31 +0800
Subject: [PATCH] fix: eliminate retry storm on 429/TPM rate limits (issue
 #1120)

---
 .../adapters/ai-adapters/src/client/sse.rs    | 18 +++-
 .../src/agentic/execution/round_executor.rs   | 87 ++++++++++++++++++-
 src/web-ui/src/locales/en-US/errors.json      |  2 +-
 src/web-ui/src/locales/zh-CN/errors.json      |  2 +-
 src/web-ui/src/locales/zh-TW/errors.json      |  2 +-
 5 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/src/crates/adapters/ai-adapters/src/client/sse.rs b/src/crates/adapters/ai-adapters/src/client/sse.rs
index d5094e00d..34f219acf 100644
--- a/src/crates/adapters/ai-adapters/src/client/sse.rs
+++ b/src/crates/adapters/ai-adapters/src/client/sse.rs
@@ -13,7 +13,15 @@ use std::time::Duration;
 use tokio::sync::mpsc;
 
 const BASE_RETRY_DELAY_MS: u64 = 500;
-const MAX_RETRY_AFTER_DELAY_MS: u64 = 10_000;
+/// Upper bound, in milliseconds, on the delay taken from a `Retry-After` header.
+///
+/// Some providers (especially TPM-based rate limits on aggregator platforms
+/// like NVIDIA's integrate API) return large `Retry-After` values of 30-60
+/// seconds. The previous 10s cap caused tight retry loops that burned through
+/// the user's request budget without actually waiting for the TPM window to
+/// reset. 60s is a reasonable upper bound that respects provider guidance
+/// without locking the user into an interminable stall.
+const MAX_RETRY_AFTER_DELAY_MS: u64 = 60_000;
 
 enum StreamSendOutcome {
     Response(reqwest::Response),
@@ -328,6 +336,14 @@ mod tests {
         );
     }
 
+    #[test]
+    fn retry_after_preserves_sub_cap_values() {
+        // A 45s `Retry-After` is below the cap and must be returned in full.
+        let mut response_headers = HeaderMap::new();
+        response_headers.insert(RETRY_AFTER, HeaderValue::from_static("45"));
+        assert_eq!(retry_after_delay_ms(&response_headers), Some(45_000));
+    }
+
     #[test]
     fn retry_delay_falls_back_to_exponential_backoff() {
         let headers = HeaderMap::new();
diff --git a/src/crates/assembly/core/src/agentic/execution/round_executor.rs b/src/crates/assembly/core/src/agentic/execution/round_executor.rs
index 9035b6b9c..0a08d564d 100644
--- a/src/crates/assembly/core/src/agentic/execution/round_executor.rs
+++ b/src/crates/assembly/core/src/agentic/execution/round_executor.rs
@@ -178,7 +178,21 @@ impl RoundExecutor {
                             max_attempts, err_msg
                         )));
                     }
-                    return Err(BitFunError::AIClient(err_msg));
+                    // Non-transient errors (429 budget exhausted, context
+                    // overflow, auth, etc.) are returned directly. The error
+                    // message is classified downstream via
+                    // `BitFunError::error_category()` into `ErrorCategory` for
+                    // frontend recovery actions (wait_and_retry, switch_model,
+                    // etc.).
+                    let error = BitFunError::AIClient(err_msg);
+                    warn!(
+                        "AI request terminal failure: session_id={}, round_id={}, category={:?}, error={}",
+                        context.session_id,
+                        round_id,
+                        error.error_category(),
+                        error
+                    );
+                    return Err(error);
                 }
             };
 
@@ -1095,9 +1109,29 @@ impl RoundExecutor {
         Self::RETRY_BASE_DELAY_MS * (1u64 << attempt_index.min(3))
     }
 
+    /// Check whether an error message represents a transient (retryable) condition.
+    ///
+    /// Errors that already exhausted the SSE-layer retry budget (e.g. "failed
+    /// after N attempts:" or "Stream retry budget exhausted") are **not**
+    /// transient from the round-executor perspective — the SSE transport layer
+    /// already retried with exponential backoff and `Retry-After` parsing.
+    /// Re-entering the send loop would multiply attempts (10 × 10 = 100) and
+    /// hold the user in a long silent stall.
     fn is_transient_network_error(error_message: &str) -> bool {
         let msg = error_message.to_lowercase();
 
+        // The SSE layer already exhausted its own retry budget — do not
+        // re-enter another round of attempts from the round executor.
+        // We require BOTH "failed after " and "attempts:" to co-occur,
+        // which uniquely identifies the SSE/round-executor budget-exhausted
+        // format without catching generic errors like "failed after timeout".
+        if msg.contains("failed after ") && msg.contains("attempts:") {
+            return false;
+        }
+        if msg.contains("retry budget exhausted") {
+            return false;
+        }
+
         let non_retryable_keywords = [
             "invalid api key",
             "unauthorized",
@@ -1471,4 +1505,55 @@ mod tests {
         assert!(trace.partial_recovery_reason.is_none());
         assert_eq!(trace.error.as_deref(), Some("request failed"));
     }
+
+    #[test]
+    fn is_transient_error_treats_rate_limit_as_transient() {
+        for message in [
+            "OpenAI Streaming API error 429 Too Many Requests",
+            "rate limit exceeded",
+        ] {
+            assert!(RoundExecutor::is_transient_network_error(message));
+        }
+    }
+
+    #[test]
+    fn is_transient_error_treats_network_errors_as_transient() {
+        // Connection resets and timeouts must be classified as transient.
+        for message in ["connection reset by peer", "timeout"] {
+            assert!(RoundExecutor::is_transient_network_error(message));
+        }
+    }
+
+    #[test]
+    fn is_transient_error_treats_context_overflow_as_non_transient() {
+        // A context-overflow message must be classified as non-transient.
+        let is_transient = RoundExecutor::is_transient_network_error("prompt is too long");
+        assert!(!is_transient);
+    }
+
+    #[test]
+    fn is_transient_error_treats_budget_exhausted_as_non_transient() {
+        // The SSE layer already spent its retry budget on these; a second
+        // round of attempts here would multiply them (10 × 10 = 100).
+        let exhausted_messages = [
+            "OpenAI Streaming API failed after 10 attempts: \
+             OpenAI Streaming API error 429 Too Many Requests",
+            "Stream retry budget exhausted after 10 attempts: timeout",
+        ];
+        for message in exhausted_messages {
+            assert!(!RoundExecutor::is_transient_network_error(message));
+        }
+    }
+
+    #[test]
+    fn is_transient_error_does_not_misclassify_failed_after_without_attempts() {
+        // "failed after " without "attempts:" is not the budget-exhausted
+        // format, so these messages must still be classified as transient.
+        for message in [
+            "stream failed after connection reset",
+            "request failed after timeout",
+        ] {
+            assert!(RoundExecutor::is_transient_network_error(message));
+        }
+    }
 }
diff --git a/src/web-ui/src/locales/en-US/errors.json b/src/web-ui/src/locales/en-US/errors.json
index ee3bd6bb5..c7353fa44 100644
--- a/src/web-ui/src/locales/en-US/errors.json
+++ b/src/web-ui/src/locales/en-US/errors.json
@@ -62,7 +62,7 @@
     "networkError": "AI response interrupted",
     "networkErrorSuggestion": "The connection was unstable or the model server closed it prematurely. Check your network and retry",
     "rateLimit": "Model rate limit exceeded",
-    "rateLimitSuggestion": "Please retry later, or switch to a different model in settings",
+    "rateLimitSuggestion": "The provider rejected this request due to rate limiting. This may be caused by too many requests in a short period, or by tokens-per-minute (TPM) limits on certain provider platforms. Wait a moment and retry, or switch to a different model in settings",
     "authError": "API authentication failed",
     "authErrorSuggestion": "The API key is invalid or expired. Check the key in your model configuration",
     "contextOverflow": "Conversation exceeds context limit",
diff --git a/src/web-ui/src/locales/zh-CN/errors.json b/src/web-ui/src/locales/zh-CN/errors.json
index a14c41d3f..58a6d708f 100644
--- a/src/web-ui/src/locales/zh-CN/errors.json
+++ b/src/web-ui/src/locales/zh-CN/errors.json
@@ -62,7 +62,7 @@
     "networkError": "AI 响应中断",
     "networkErrorSuggestion": "网络连接不稳定或模型服务端提前关闭了连接，请检查网络后重试",
     "rateLimit": "模型请求频率超限",
-    "rateLimitSuggestion": "请稍后重试，或在模型设置中切换到其他模型",
+    "rateLimitSuggestion": "服务商因频率限制拒绝了本次请求。可能是短时间内请求过多，也可能是某些服务商平台存在每分钟 token 数（TPM）限制。请稍候重试，或在设置中切换到其他模型",
     "authError": "API 认证失败",
     "authErrorSuggestion": "API 密钥无效或已过期，请检查模型配置中的密钥设置",
     "contextOverflow": "对话内容超出限制",
diff --git a/src/web-ui/src/locales/zh-TW/errors.json b/src/web-ui/src/locales/zh-TW/errors.json
index 1437aa626..ff8edf318 100644
--- a/src/web-ui/src/locales/zh-TW/errors.json
+++ b/src/web-ui/src/locales/zh-TW/errors.json
@@ -62,7 +62,7 @@
     "networkError": "AI 響應中斷",
     "networkErrorSuggestion": "網絡連接不穩定或模型服務端提前關閉了連接，請檢查網絡後重試",
     "rateLimit": "模型請求頻率超限",
-    "rateLimitSuggestion": "請稍後重試，或在模型設置中切換到其他模型",
+    "rateLimitSuggestion": "服務商因頻率限制拒絕了本次請求。可能是短時間內請求過多，也可能是某些服務商平台存在每分鐘 token 數（TPM）限制。請稍候重試，或在設定中切換到其他模型",
     "authError": "API 認證失敗",
     "authErrorSuggestion": "API 密鑰無效或已過期，請檢查模型設定中的密鑰設置",
     "contextOverflow": "對話內容超出限制",