diff --git a/packages/types/src/provider-settings.ts b/packages/types/src/provider-settings.ts
index 3b3717ad45..cc74f9f43d 100644
--- a/packages/types/src/provider-settings.ts
+++ b/packages/types/src/provider-settings.ts
@@ -250,6 +250,7 @@ const openAiSchema = baseProviderSettingsSchema.extend({
 	openAiStreamingEnabled: z.boolean().optional(),
 	openAiHostHeader: z.string().optional(), // Keep temporarily for backward compatibility during migration.
 	openAiHeaders: z.record(z.string(), z.string()).optional(),
+	openAiStrictToolMessageOrdering: z.boolean().optional(), // For providers that don't allow user messages after tool messages.
 })
 
 const ollamaSchema = baseProviderSettingsSchema.extend({
diff --git a/src/api/providers/openai.ts b/src/api/providers/openai.ts
index 43a4fe7ae3..b6b5147739 100644
--- a/src/api/providers/openai.ts
+++ b/src/api/providers/openai.ts
@@ -102,11 +102,18 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 			content: systemPrompt,
 		}
 
+		// When strict tool message ordering is enabled, merge text content after tool_results
+		// into the last tool message instead of creating a separate user message.
+		// This is required for providers like NVIDIA NIM that don't allow user messages after tool messages.
+		const strictToolMessageOrdering = this.options.openAiStrictToolMessageOrdering ?? false
+
 		if (this.options.openAiStreamingEnabled ?? true) {
 			let convertedMessages
 
 			if (deepseekReasoner) {
-				convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
+				convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages], {
+					mergeToolResultText: strictToolMessageOrdering,
+				})
 			} else {
 				if (modelInfo.supportsPromptCache) {
 					systemMessage = {
@@ -122,7 +129,10 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 					}
 				}
 
-				convertedMessages = [systemMessage, ...convertToOpenAiMessages(messages)]
+				convertedMessages = [
+					systemMessage,
+					...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
+				]
 
 				if (modelInfo.supportsPromptCache) {
 					// Note: the following logic is copied from openrouter:
@@ -227,8 +237,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 				model: modelId,
 				messages: deepseekReasoner
-					? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
-					: [systemMessage, ...convertToOpenAiMessages(messages)],
+					? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages], {
+							mergeToolResultText: strictToolMessageOrdering,
+						})
+					: [
+							systemMessage,
+							...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
+						],
 				...(metadata?.tools && { tools: this.convertToolsForOpenAI(metadata.tools) }),
 				...(metadata?.tool_choice && { tool_choice: metadata.tool_choice }),
 				...(metadata?.toolProtocol === "native" && {
@@ -338,6 +353,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 	): ApiStream {
 		const modelInfo = this.getModel().info
 		const methodIsAzureAiInference = this._isAzureAiInference(this.options.openAiBaseUrl)
+		const strictToolMessageOrdering = this.options.openAiStrictToolMessageOrdering ?? false
 
 		if (this.options.openAiStreamingEnabled ?? true) {
 			const isGrokXAI = this._isGrokXAI(this.options.openAiBaseUrl)
@@ -349,7 +365,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 						role: "developer",
 						content: `Formatting re-enabled\n${systemPrompt}`,
 					},
-					...convertToOpenAiMessages(messages),
+					...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
 				],
 				stream: true,
 				...(isGrokXAI ? {} : { stream_options: { include_usage: true } }),
@@ -386,7 +402,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
 						role: "developer",
 						content: `Formatting re-enabled\n${systemPrompt}`,
 					},
-					...convertToOpenAiMessages(messages),
+					...convertToOpenAiMessages(messages, { mergeToolResultText: strictToolMessageOrdering }),
 				],
 				reasoning_effort: modelInfo.reasoningEffort as "low" | "medium" | "high" | undefined,
 				temperature: undefined,
diff --git a/src/api/transform/__tests__/openai-format.spec.ts b/src/api/transform/__tests__/openai-format.spec.ts
index d5d4404837..33996763ef 100644
--- a/src/api/transform/__tests__/openai-format.spec.ts
+++ b/src/api/transform/__tests__/openai-format.spec.ts
@@ -319,7 +319,7 @@ describe("convertToOpenAiMessages", () => {
 		)
 	})
 
-	it("should NOT merge text when images are present (fall back to user message)", () => {
+	it("should merge text and send images separately when mergeToolResultText is true", () => {
 		const anthropicMessages: Anthropic.Messages.MessageParam[] = [
 			{
 				role: "user",
@@ -329,6 +329,10 @@ describe("convertToOpenAiMessages", () => {
 						tool_use_id: "tool-123",
 						content: "Tool result content",
 					},
+					{
+						type: "text",
+						text: "Context info",
+					},
 					{
 						type: "image",
 						source: {
@@ -343,9 +347,57 @@ describe("convertToOpenAiMessages", () => {
 
 		const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
 
-		// Should produce a tool message AND a user message (because image is present)
+		// Should produce a tool message with merged text AND a user message for the image
+		expect(openAiMessages).toHaveLength(2)
+
+		// First message: tool message with merged text
+		const toolMessage = openAiMessages[0] as OpenAI.Chat.ChatCompletionToolMessageParam
+		expect(toolMessage.role).toBe("tool")
+		expect(toolMessage.tool_call_id).toBe("tool-123")
+		expect(toolMessage.content).toBe(
+			"Tool result content\n\nContext info",
+		)
+
+		// Second message: user message with only the image
+		expect(openAiMessages[1].role).toBe("user")
+		const userContent = openAiMessages[1].content as Array<{ type: string; image_url?: { url: string } }>
+		expect(Array.isArray(userContent)).toBe(true)
+		expect(userContent).toHaveLength(1)
+		expect(userContent[0].type).toBe("image_url")
+		expect(userContent[0].image_url?.url).toBe("data:image/png;base64,base64data")
+	})
+
+	it("should send only images as user message when no text content exists with mergeToolResultText", () => {
+		const anthropicMessages: Anthropic.Messages.MessageParam[] = [
+			{
+				role: "user",
+				content: [
+					{
+						type: "tool_result",
+						tool_use_id: "tool-123",
+						content: "Tool result content",
+					},
+					{
+						type: "image",
+						source: {
+							type: "base64",
+							media_type: "image/png",
+							data: "base64data",
+						},
+					},
+				],
+			},
+		]
+
+		const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
+
+		// Should produce a tool message AND a user message (only image, no text to merge)
 		expect(openAiMessages).toHaveLength(2)
 		expect((openAiMessages[0] as OpenAI.Chat.ChatCompletionToolMessageParam).role).toBe("tool")
+		// Tool message content should NOT be modified since there's no text to merge
+		expect((openAiMessages[0] as OpenAI.Chat.ChatCompletionToolMessageParam).content).toBe(
+			"Tool result content",
+		)
 		expect(openAiMessages[1].role).toBe("user")
 	})
 
@@ -430,6 +482,122 @@ describe("convertToOpenAiMessages", () => {
 		expect(openAiMessages).toHaveLength(1)
 		expect(openAiMessages[0].role).toBe("user")
 	})
+
+	it("should merge text into tool messages for multiple tool calls across conversation turns", () => {
+		// This test simulates a full conversation with multiple tool_result + environment_details messages
+		// to ensure mergeToolResultText works correctly for ALL tool_result messages, not just the first one
+		// Regression test for: "The fix works for the first message but after the first response the text content is NOT merged"
+		const anthropicMessages: Anthropic.Messages.MessageParam[] = [
+			// Initial user message (no tool_results)
+			{
+				role: "user",
+				content: [
+					{ type: "text", text: "Create a file for me" },
+					{ type: "text", text: "Context 1" },
+				],
+			},
+			// Assistant uses first tool
+			{
+				role: "assistant",
+				content: [
+					{ type: "text", text: "I'll create the file for you." },
+					{
+						type: "tool_use",
+						id: "call_1",
+						name: "write_file",
+						input: { path: "test.txt", content: "hello" },
+					},
+				],
+			},
+			// First tool result + environment_details
+			{
+				role: "user",
+				content: [
+					{ type: "tool_result", tool_use_id: "call_1", content: "File created successfully" },
+					{ type: "text", text: "Context 2" },
+				],
+			},
+			// Assistant uses second tool
+			{
+				role: "assistant",
+				content: [
+					{ type: "text", text: "Now I'll read the file to verify." },
+					{ type: "tool_use", id: "call_2", name: "read_file", input: { path: "test.txt" } },
+				],
+			},
+			// Second tool result + environment_details (this is where the bug was reported)
+			{
+				role: "user",
+				content: [
+					{ type: "tool_result", tool_use_id: "call_2", content: "File content: hello" },
+					{ type: "text", text: "Context 3" },
+				],
+			},
+		]
+
+		const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
+
+		// Expected structure:
+		// 1. User message (initial, no tool_results - text should remain as user message)
+		// 2. Assistant message with tool_calls
+		// 3. Tool message with merged text (first tool_result)
+		// 4. Assistant message with tool_calls
+		// 5. Tool message with merged text (second tool_result)
+		expect(openAiMessages).toHaveLength(5)
+
+		// First message should be a user message (no tool_results to merge into)
+		expect(openAiMessages[0].role).toBe("user")
+
+		// Second message should be assistant with tool_calls
+		expect(openAiMessages[1].role).toBe("assistant")
+		expect((openAiMessages[1] as OpenAI.Chat.ChatCompletionAssistantMessageParam).tool_calls).toHaveLength(1)
+
+		// Third message should be tool message with merged environment_details
+		const firstToolMsg = openAiMessages[2] as OpenAI.Chat.ChatCompletionToolMessageParam
+		expect(firstToolMsg.role).toBe("tool")
+		expect(firstToolMsg.tool_call_id).toBe("call_1")
+		expect(firstToolMsg.content).toContain("File created successfully")
+		expect(firstToolMsg.content).toContain("Context 2")
+
+		// Fourth message should be assistant with tool_calls
+		expect(openAiMessages[3].role).toBe("assistant")
+		expect((openAiMessages[3] as OpenAI.Chat.ChatCompletionAssistantMessageParam).tool_calls).toHaveLength(1)
+
+		// Fifth message should be tool message with merged environment_details (THE BUG FIX)
+		const secondToolMsg = openAiMessages[4] as OpenAI.Chat.ChatCompletionToolMessageParam
+		expect(secondToolMsg.role).toBe("tool")
+		expect(secondToolMsg.tool_call_id).toBe("call_2")
+		expect(secondToolMsg.content).toContain("File content: hello")
+		expect(secondToolMsg.content).toContain("Context 3")
+	})
+
+	it("should NOT create user messages after tool messages when mergeToolResultText is true", () => {
+		// This test specifically verifies that the "user after tool" error is avoided
+		const anthropicMessages: Anthropic.Messages.MessageParam[] = [
+			{
+				role: "assistant",
+				content: [{ type: "tool_use", id: "tool_1", name: "read_file", input: { path: "test.ts" } }],
+			},
+			{
+				role: "user",
+				content: [
+					{ type: "tool_result", tool_use_id: "tool_1", content: "File contents" },
+					{ type: "text", text: "Some context" },
+				],
+			},
+		]
+
+		const openAiMessages = convertToOpenAiMessages(anthropicMessages, { mergeToolResultText: true })
+
+		// Should produce assistant + tool (no user message)
+		expect(openAiMessages).toHaveLength(2)
+		expect(openAiMessages[0].role).toBe("assistant")
+		expect(openAiMessages[1].role).toBe("tool")
+		// The text should be merged into the tool message, NOT as a separate user message
+		expect((openAiMessages[1] as OpenAI.Chat.ChatCompletionToolMessageParam).content).toContain(
+			"Some context",
+		)
+	})
 })
 
 describe("reasoning_details transformation", () => {
diff --git a/src/api/transform/openai-format.ts b/src/api/transform/openai-format.ts
index ad02be5541..f182a2aac1 100644
--- a/src/api/transform/openai-format.ts
+++ b/src/api/transform/openai-format.ts
@@ -138,24 +138,40 @@ export function convertToOpenAiMessages(
 
 			// Process non-tool messages
 			if (nonToolMessages.length > 0) {
-				// Check if we should merge text into the last tool message
-				// This is critical for reasoning/thinking models where a user message
-				// after tool results causes the model to drop all previous reasoning_content
-				const hasOnlyTextContent = nonToolMessages.every((part) => part.type === "text")
 				const hasToolMessages = toolMessages.length > 0
-				const shouldMergeIntoToolMessage =
-					options?.mergeToolResultText && hasToolMessages && hasOnlyTextContent
 
-				if (shouldMergeIntoToolMessage) {
+				if (options?.mergeToolResultText && hasToolMessages) {
+					// When mergeToolResultText is enabled, separate text and images
+					// Merge text into the last tool message, and send images separately
+					// This is critical for providers like NVIDIA NIM that don't allow user messages after tool messages
+					const textMessages = nonToolMessages.filter(
+						(part) => part.type === "text",
+					) as Anthropic.TextBlockParam[]
+					const imageMessages = nonToolMessages.filter(
+						(part) => part.type === "image",
+					) as Anthropic.ImageBlockParam[]
+
 					// Merge text content into the last tool message
-					const lastToolMessage = openAiMessages[
-						openAiMessages.length - 1
-					] as OpenAI.Chat.ChatCompletionToolMessageParam
-					if (lastToolMessage?.role === "tool") {
-						const additionalText = nonToolMessages
-							.map((part) => (part as Anthropic.TextBlockParam).text)
-							.join("\n")
-						lastToolMessage.content = `${lastToolMessage.content}\n\n${additionalText}`
+					if (textMessages.length > 0) {
+						const lastToolMessage = openAiMessages[
+							openAiMessages.length - 1
+						] as OpenAI.Chat.ChatCompletionToolMessageParam
+						if (lastToolMessage?.role === "tool") {
+							const additionalText = textMessages.map((part) => part.text).join("\n")
+							lastToolMessage.content = `${lastToolMessage.content}\n\n${additionalText}`
+						}
+					}
+
+					// Send images as a separate user message if any
+					// Note: Images must still be sent as user messages since tool messages don't support images
+					if (imageMessages.length > 0) {
+						openAiMessages.push({
+							role: "user",
+							content: imageMessages.map((part) => ({
+								type: "image_url",
+								image_url: { url: `data:${part.source.media_type};base64,${part.source.data}` },
+							})),
+						})
+					}
 				} else {
 					// Standard behavior: add user message with text/image content
diff --git a/webview-ui/src/components/settings/providers/OpenAICompatible.tsx b/webview-ui/src/components/settings/providers/OpenAICompatible.tsx
index 4eea6f09f1..cc1777ca76 100644
--- a/webview-ui/src/components/settings/providers/OpenAICompatible.tsx
+++ b/webview-ui/src/components/settings/providers/OpenAICompatible.tsx
@@ -158,6 +158,16 @@ export const OpenAICompatible = ({
 				onChange={handleInputChange("openAiStreamingEnabled", noTransform)}>
 				{t("settings:modelInfo.enableStreaming")}
 			</Checkbox>
+			<Checkbox
+				checked={apiConfiguration?.openAiStrictToolMessageOrdering ?? false}
+				onChange={handleInputChange("openAiStrictToolMessageOrdering", noTransform)}>
+				{t("settings:providers.openAiStrictToolMessageOrdering.label")}
+			</Checkbox>
+			<div className="text-sm text-vscode-descriptionForeground">
+				{t("settings:providers.openAiStrictToolMessageOrdering.description")}
+			</div>
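
For reference, a minimal usage sketch of the converter behavior this patch introduces. It is illustrative only: the relative import path and the sample strings are assumptions that follow the patterns used in the test file above, not part of the change itself.

import { Anthropic } from "@anthropic-ai/sdk"
import { convertToOpenAiMessages } from "../openai-format"

// A turn where the user message carries a tool_result followed by extra text
// (e.g. environment details appended by the host application).
const history: Anthropic.Messages.MessageParam[] = [
	{
		role: "assistant",
		content: [{ type: "tool_use", id: "call_1", name: "read_file", input: { path: "test.ts" } }],
	},
	{
		role: "user",
		content: [
			{ type: "tool_result", tool_use_id: "call_1", content: "File contents" },
			{ type: "text", text: "Extra context for the model" },
		],
	},
]

// With mergeToolResultText (what openAiStrictToolMessageOrdering turns on in the handler),
// the trailing text is appended to the tool message, so no user message follows a tool message:
// [assistant (tool_calls), tool ("File contents\n\nExtra context for the model")]
const strictOrdering = convertToOpenAiMessages(history, { mergeToolResultText: true })

// Default behavior keeps the text as a separate user message after the tool message:
// [assistant (tool_calls), tool, user]
const defaultOrdering = convertToOpenAiMessages(history)

console.log(strictOrdering.length, defaultOrdering.length) // 2 3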