From d6831c2e83eef48be9518261f03982632fc54029 Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 13 Jun 2026 18:31:31 +0800 Subject: [PATCH] fix: eliminate retry storm on 429/TPM rate limits (issue #1120) --- .../adapters/ai-adapters/src/client/sse.rs | 18 +++- .../src/agentic/execution/round_executor.rs | 87 ++++++++++++++++++- src/web-ui/src/locales/en-US/errors.json | 2 +- src/web-ui/src/locales/zh-CN/errors.json | 2 +- src/web-ui/src/locales/zh-TW/errors.json | 2 +- 5 files changed, 106 insertions(+), 5 deletions(-) diff --git a/src/crates/adapters/ai-adapters/src/client/sse.rs b/src/crates/adapters/ai-adapters/src/client/sse.rs index d5094e00d..34f219acf 100644 --- a/src/crates/adapters/ai-adapters/src/client/sse.rs +++ b/src/crates/adapters/ai-adapters/src/client/sse.rs @@ -13,7 +13,15 @@ use std::time::Duration; use tokio::sync::mpsc; const BASE_RETRY_DELAY_MS: u64 = 500; -const MAX_RETRY_AFTER_DELAY_MS: u64 = 10_000; +/// Maximum delay applied to a `Retry-After` header value. +/// +/// Some providers (especially TPM-based rate limits on aggregator platforms +/// like NVIDIA's integrate API) return large `Retry-After` values of 30-60 +/// seconds. Capping at 10s caused tight retry loops that burned through the +/// user's request budget without actually waiting for the TPM window to reset. +/// 60s is a reasonable upper bound that respects provider guidance without +/// locking the user into an interminable stall. +const MAX_RETRY_AFTER_DELAY_MS: u64 = 60_000; enum StreamSendOutcome { Response(reqwest::Response), @@ -328,6 +336,14 @@ mod tests { ); } + #[test] + fn retry_after_preserves_sub_cap_values() { + let mut headers = HeaderMap::new(); + headers.insert(RETRY_AFTER, HeaderValue::from_static("45")); + + assert_eq!(retry_after_delay_ms(&headers), Some(45_000)); + } + #[test] fn retry_delay_falls_back_to_exponential_backoff() { let headers = HeaderMap::new(); diff --git a/src/crates/assembly/core/src/agentic/execution/round_executor.rs b/src/crates/assembly/core/src/agentic/execution/round_executor.rs index 9035b6b9c..0a08d564d 100644 --- a/src/crates/assembly/core/src/agentic/execution/round_executor.rs +++ b/src/crates/assembly/core/src/agentic/execution/round_executor.rs @@ -178,7 +178,21 @@ impl RoundExecutor { max_attempts, err_msg ))); } - return Err(BitFunError::AIClient(err_msg)); + // Non-transient errors (429 budget exhausted, context + // overflow, auth, etc.) are returned directly. The error + // message is classified downstream via + // `BitFunError::error_category()` into `ErrorCategory` for + // frontend recovery actions (wait_and_retry, switch_model, + // etc.). + let error = BitFunError::AIClient(err_msg); + warn!( + "AI request terminal failure: session_id={}, round_id={}, category={:?}, error={}", + context.session_id, + round_id, + error.error_category(), + error + ); + return Err(error); } }; @@ -1095,9 +1109,29 @@ impl RoundExecutor { Self::RETRY_BASE_DELAY_MS * (1u64 << attempt_index.min(3)) } + /// Check whether an error message represents a transient (retryable) condition. + /// + /// Errors that already exhausted the SSE-layer retry budget (e.g. "failed + /// after N attempts:" or "Stream retry budget exhausted") are **not** + /// transient from the round-executor perspective — the SSE transport layer + /// already retried with exponential backoff and `Retry-After` parsing. + /// Re-entering the send loop would multiply attempts (10 × 10 = 100) and + /// hold the user in a long silent stall. fn is_transient_network_error(error_message: &str) -> bool { let msg = error_message.to_lowercase(); + // The SSE layer already exhausted its own retry budget — do not + // re-enter another round of attempts from the round executor. + // We require BOTH "failed after " and "attempts:" to co-occur, + // which uniquely identifies the SSE/round-executor budget-exhausted + // format without catching generic errors like "failed after timeout". + if msg.contains("failed after ") && msg.contains("attempts:") { + return false; + } + if msg.contains("retry budget exhausted") { + return false; + } + let non_retryable_keywords = [ "invalid api key", "unauthorized", @@ -1471,4 +1505,55 @@ mod tests { assert!(trace.partial_recovery_reason.is_none()); assert_eq!(trace.error.as_deref(), Some("request failed")); } + + #[test] + fn is_transient_error_treats_rate_limit_as_transient() { + assert!(RoundExecutor::is_transient_network_error( + "OpenAI Streaming API error 429 Too Many Requests" + )); + assert!(RoundExecutor::is_transient_network_error( + "rate limit exceeded" + )); + } + + #[test] + fn is_transient_error_treats_network_errors_as_transient() { + assert!(RoundExecutor::is_transient_network_error( + "connection reset by peer" + )); + assert!(RoundExecutor::is_transient_network_error("timeout")); + } + + #[test] + fn is_transient_error_treats_context_overflow_as_non_transient() { + assert!(!RoundExecutor::is_transient_network_error( + "prompt is too long" + )); + } + + #[test] + fn is_transient_error_treats_budget_exhausted_as_non_transient() { + // After SSE layer exhausts its retry budget, the round executor must + // NOT re-enter another round of attempts (would cause 10×10 = 100 + // retries). + assert!(!RoundExecutor::is_transient_network_error( + "OpenAI Streaming API failed after 10 attempts: \ + OpenAI Streaming API error 429 Too Many Requests" + )); + assert!(!RoundExecutor::is_transient_network_error( + "Stream retry budget exhausted after 10 attempts: timeout" + )); + } + + #[test] + fn is_transient_error_does_not_misclassify_failed_after_without_attempts() { + // "failed after " without "attempts:" should NOT be treated as budget + // exhausted — it may be a legitimately retryable transient error. + assert!(RoundExecutor::is_transient_network_error( + "stream failed after connection reset" + )); + assert!(RoundExecutor::is_transient_network_error( + "request failed after timeout" + )); + } } diff --git a/src/web-ui/src/locales/en-US/errors.json b/src/web-ui/src/locales/en-US/errors.json index ee3bd6bb5..c7353fa44 100644 --- a/src/web-ui/src/locales/en-US/errors.json +++ b/src/web-ui/src/locales/en-US/errors.json @@ -62,7 +62,7 @@ "networkError": "AI response interrupted", "networkErrorSuggestion": "The connection was unstable or the model server closed it prematurely. Check your network and retry", "rateLimit": "Model rate limit exceeded", - "rateLimitSuggestion": "Please retry later, or switch to a different model in settings", + "rateLimitSuggestion": "The provider rejected this request due to rate limiting. This may be caused by too many requests in a short period, or by tokens-per-minute (TPM) limits on certain provider platforms. Wait a moment and retry, or switch to a different model in settings", "authError": "API authentication failed", "authErrorSuggestion": "The API key is invalid or expired. Check the key in your model configuration", "contextOverflow": "Conversation exceeds context limit", diff --git a/src/web-ui/src/locales/zh-CN/errors.json b/src/web-ui/src/locales/zh-CN/errors.json index a14c41d3f..58a6d708f 100644 --- a/src/web-ui/src/locales/zh-CN/errors.json +++ b/src/web-ui/src/locales/zh-CN/errors.json @@ -62,7 +62,7 @@ "networkError": "AI 响应中断", "networkErrorSuggestion": "网络连接不稳定或模型服务端提前关闭了连接,请检查网络后重试", "rateLimit": "模型请求频率超限", - "rateLimitSuggestion": "请稍后重试,或在模型设置中切换到其他模型", + "rateLimitSuggestion": "服务商因频率限制拒绝了本次请求。可能是短时间内请求过多,也可能是某些服务商平台存在每分钟 token 数(TPM)限制。请稍候重试,或在设置中切换到其他模型", "authError": "API 认证失败", "authErrorSuggestion": "API 密钥无效或已过期,请检查模型配置中的密钥设置", "contextOverflow": "对话内容超出限制", diff --git a/src/web-ui/src/locales/zh-TW/errors.json b/src/web-ui/src/locales/zh-TW/errors.json index 1437aa626..ff8edf318 100644 --- a/src/web-ui/src/locales/zh-TW/errors.json +++ b/src/web-ui/src/locales/zh-TW/errors.json @@ -62,7 +62,7 @@ "networkError": "AI 響應中斷", "networkErrorSuggestion": "網絡連接不穩定或模型服務端提前關閉了連接,請檢查網絡後重試", "rateLimit": "模型請求頻率超限", - "rateLimitSuggestion": "請稍後重試,或在模型設置中切換到其他模型", + "rateLimitSuggestion": "服務商因頻率限制拒絕了本次請求。可能是短時間內請求過多,也可能是某些服務商平台存在每分鐘 token 數(TPM)限制。請稍候重試,或在設定中切換到其他模型", "authError": "API 認證失敗", "authErrorSuggestion": "API 密鑰無效或已過期,請檢查模型設定中的密鑰設置", "contextOverflow": "對話內容超出限制",