diff --git a/src/client/ollama.rs b/src/client/ollama.rs index 332fb2b..ba4a51b 100644 --- a/src/client/ollama.rs +++ b/src/client/ollama.rs @@ -92,10 +92,11 @@ impl OllamaClient { let status = response.status().as_u16(); let body = response.text().await.unwrap_or_default(); error!("Ollama chat failed with status {}: {}", status, body); - return Err(NodeTokenError::Ollama(format!( - "Chat failed: HTTP {} - {}", - status, body - ))); + // Use HttpError so the status code is preserved and the executor can derive is_client_error from it (supports the B1 fix) + return Err(NodeTokenError::HttpError { + status, + message: body, + }); } let chat_response: OllamaChatResponse = response.json().await.map_err(|e| { diff --git a/src/protocol/types.rs b/src/protocol/types.rs index 9b6a08c..47aa55a 100644 --- a/src/protocol/types.rs +++ b/src/protocol/types.rs @@ -202,6 +202,10 @@ pub enum NodeTaskResult { code: String, /// 错误消息 message: String, + /// Whether the failure was caused by the request itself (e.g. unsupported model, invalid parameters). + /// Older servers that ignore this field keep treating it as an ordinary failure, so this stays backward compatible; a missing field deserializes as false via #[serde(default)]. + #[serde(default)] + is_client_error: bool, }, } @@ -709,6 +713,7 @@ mod tests { result: NodeTaskResult::Failed { code: "ollama_error".to_string(), message: "Model not found".to_string(), + is_client_error: false, }, }; let json = serde_json::to_string(&req).unwrap(); @@ -718,9 +723,14 @@ mod tests { assert_eq!(parsed.protocol_version, "node.v1"); assert_eq!(parsed.task_id, task_id); match parsed.result { - NodeTaskResult::Failed { code, message } => { + NodeTaskResult::Failed { + code, + message, + is_client_error, + } => { assert_eq!(code, "ollama_error"); assert_eq!(message, "Model not found"); + assert!(!is_client_error); } NodeTaskResult::Succeeded { .. 
} => panic!("Expected Failed variant"), } diff --git a/src/runtime/executor.rs b/src/runtime/executor.rs index 9834de3..3ee8c82 100644 --- a/src/runtime/executor.rs +++ b/src/runtime/executor.rs @@ -86,10 +86,7 @@ impl TaskExecutor { } Err(e) => { error!("Task {} execution failed: {}", task_id, e); - NodeTaskResult::Failed { - code: "ollama_error".to_string(), - message: e.to_string(), - } + classify_ollama_error(&e) } }; @@ -307,6 +304,32 @@ impl TaskExecutor { } } +/// Classifies an Ollama call error into a NodeTaskResult carrying the is_client_error flag: +/// - Ollama 4xx (e.g. "model does not support chat") → is_client_error=true +/// (the server is expected to fail the task terminally right away and not count it toward the node failure_count; NOTE(review): the 400..500 guard also covers 408/429, which are usually transient, so confirm they should be terminal) +/// - Ollama 5xx / network errors / any other error → is_client_error=false (default behavior: requeue + failure counting) +fn classify_ollama_error(err: &NodeTokenError) -> NodeTaskResult { + let (code, message, is_client_error) = match err { + NodeTokenError::HttpError { status, message } if (400..500).contains(status) => ( + format!("ollama_http_{}", status), + message.clone(), + true, + ), + NodeTokenError::HttpError { status, message } => ( + format!("ollama_http_{}", status), + message.clone(), + false, + ), + NodeTokenError::Network(e) => ("ollama_network".to_string(), e.to_string(), false), + other => ("ollama_error".to_string(), other.to_string(), false), + }; + NodeTaskResult::Failed { + code, + message, + is_client_error, + } +} + // 实现 Clone 以便在 tokio::spawn 中使用 impl Clone for TaskExecutor { fn clone(&self) -> Self { @@ -604,18 +627,59 @@ mod tests { let result = NodeTaskResult::Failed { code: "ollama_error".to_string(), message: error_msg.to_string(), + is_client_error: false, }; // 验证结果结构 match result { - NodeTaskResult::Failed { code, message } => { + NodeTaskResult::Failed { + code, + message, + is_client_error, + } => { assert_eq!(code, "ollama_error"); assert_eq!(message, error_msg); + assert!(!is_client_error); } NodeTaskResult::Succeeded { .. 
} => panic!("Expected Failed variant"), } } + /// Regression test for the B1 fix: HTTP 4xx must be classified as is_client_error=true + #[test] + fn test_classify_ollama_4xx_is_client_error() { + let err = NodeTokenError::HttpError { + status: 400, + message: "model does not support chat".to_string(), + }; + match classify_ollama_error(&err) { + NodeTaskResult::Failed { + is_client_error, + code, + .. + } => { + assert!(is_client_error, "4xx must mark is_client_error=true"); + assert_eq!(code, "ollama_http_400"); + } + _ => panic!("expected Failed"), + } + } + + /// HTTP 5xx must be treated as a node error, is_client_error=false (keeps the existing retry behavior) + #[test] + fn test_classify_ollama_5xx_is_not_client_error() { + let err = NodeTokenError::HttpError { + status: 503, + message: "service unavailable".to_string(), + }; + match classify_ollama_error(&err) { + NodeTaskResult::Failed { + is_client_error, .. + } => assert!(!is_client_error), + _ => panic!("expected Failed"), + } + } + #[test] /// 验证任务 deadline 和 grace period 的计算逻辑 fn test_task_deadline_and_grace_period() { diff --git a/tests/integration_error_scenarios.rs b/tests/integration_error_scenarios.rs index 2664999..ebd8de3 100644 --- a/tests/integration_error_scenarios.rs +++ b/tests/integration_error_scenarios.rs @@ -109,7 +109,7 @@ async fn test_ollama_connection_failure() { // 验证连接失败 assert!(result.is_err(), "Ollama 连接应该失败"); - // 验证错误信息(可能是 Ollama 错误或 HTTP 错误) + // Check the error message (an HttpError, Ollama, or status code description is accepted) let error_msg = format!("{:?}", result.unwrap_err()); assert!( error_msg.contains("connection") @@ -117,7 +117,9 @@ async fn test_ollama_connection_failure() { || error_msg.contains("Connection") || error_msg.contains("Ollama") || error_msg.contains("503") - || error_msg.contains("HTTP"), + || error_msg.contains("502") + || error_msg.contains("HTTP") + || error_msg.contains("HttpError"), "错误信息应该包含 Ollama 或连接失败描述: {}", error_msg );