From 2da8c136a28c77d8960fd9f652599f8245967b66 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 11:38:19 +0900 Subject: [PATCH 1/6] AG-UI bug fixes --- .../_message_adapters.py | 16 +- .../ag-ui/agent_framework_ag_ui/_run.py | 115 ++++++++++- .../ag-ui/tests/test_message_hygiene.py | 135 ++++++++++++ python/packages/ag-ui/tests/test_run.py | 192 ++++++++++++++++++ 4 files changed, 442 insertions(+), 16 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index f8f1623a30..cf189b0a4d 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -67,7 +67,7 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: if approval_call_ids and pending_tool_call_ids: pending_tool_call_ids -= approval_call_ids logger.info( - f"FunctionApprovalResponseContent found for call_ids={sorted(approval_call_ids)} - " + f"function_approval_response content found for call_ids={sorted(approval_call_ids)} - " "framework will handle execution" ) @@ -150,6 +150,10 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: call_id = str(content.call_id) if call_id in pending_tool_call_ids: keep = True + # Remove the call_id from pending since we now have its result. + # This prevents duplicate synthetic "skipped" results from being + # injected when a user message arrives later. + pending_tool_call_ids.discard(call_id) if call_id == pending_confirm_changes_id: pending_confirm_changes_id = None break @@ -338,7 +342,7 @@ def _filter_modified_args( result: list[ChatMessage] = [] for msg in messages: # Handle standard tool result messages early (role="tool") to preserve provider invariants - # This path maps AG‑UI tool messages to FunctionResultContent with the correct tool_call_id + # This path maps AG‑UI tool messages to function_result content with the correct tool_call_id role_str = normalize_agui_role(msg.get("role", "user")) if role_str == "tool": # Prefer explicit tool_call_id fields; fall back to backend fields only if necessary @@ -371,7 +375,7 @@ def _filter_modified_args( if is_approval: # Look for the matching function call in previous messages to create - # a proper FunctionApprovalResponseContent. This enables the agent framework + # proper function_approval_response content. This enables the agent framework # to execute the approved tool (fix for GitHub issue #3034). 
accepted = parsed.get("accepted", False) if parsed is not None else False approval_payload_text = result_content if isinstance(result_content, str) else json.dumps(parsed) @@ -465,7 +469,7 @@ def _filter_modified_args( # No modified arguments - use the original function call func_call_for_approval = matching_func_call - # Create FunctionApprovalResponseContent for the agent framework + # Create function_approval_response content for the agent framework approval_response = Content.from_function_approval_response( approved=accepted, id=str(approval_call_id), @@ -489,7 +493,7 @@ def _filter_modified_args( result.append(chat_msg) continue - # Cast result_content to acceptable type for FunctionResultContent + # Cast result_content to acceptable type for function_result content func_result: str | dict[str, Any] | list[Any] if isinstance(result_content, str): func_result = result_content @@ -566,7 +570,7 @@ def _filter_modified_args( # Check if this message contains function approvals if "function_approvals" in msg and msg["function_approvals"]: - # Convert function approvals to FunctionApprovalResponseContent + # Convert function approvals to function_approval_response content approval_contents: list[Any] = [] for approval in msg["function_approvals"]: # Create FunctionCallContent with the modified arguments diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index d1229620a7..e9b7554e17 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -45,6 +45,7 @@ convert_agui_tools_to_agent_framework, generate_event_id, get_conversation_id_from_update, + get_role_value, make_json_safe, ) @@ -344,7 +345,7 @@ def _emit_tool_result( flow: FlowState, predictive_handler: PredictiveStateHandler | None = None, ) -> list[BaseEvent]: - """Emit ToolCallResult events for FunctionResultContent.""" + """Emit ToolCallResult events for function_result content.""" events: list[BaseEvent] = [] # Cannot emit tool result without a call_id to associate it with @@ -385,6 +386,12 @@ def _emit_tool_result( # After tool result, any subsequent text should start a new message flow.tool_call_id = None flow.tool_call_name = None + + # Close any open text message before resetting message_id (issue #3568) + # This handles the case where a TextMessageStartEvent was emitted for tool-only + # messages (Feature #4) but needs to be closed before starting a new message + if flow.message_id: + events.append(TextMessageEndEvent(message_id=flow.message_id)) flow.message_id = None # Reset so next text content starts a new message return events @@ -558,8 +565,8 @@ async def _resolve_approval_responses( ) -> None: """Execute approved function calls and replace approval content with results. - This modifies the messages list in place, replacing FunctionApprovalResponseContent - with FunctionResultContent containing the actual tool execution result. + This modifies the messages list in place, replacing function_approval_response + content with function_result content containing the actual tool execution result. Args: messages: List of messages (will be modified in place) @@ -622,6 +629,76 @@ async def _resolve_approval_responses( _replace_approval_contents_with_results(messages, fcc_todo, normalized_results) # type: ignore + # Post-process: Convert user messages with function_result content to proper tool messages. 
+ # After _replace_approval_contents_with_results, approved tool calls have their results + # placed in user messages. OpenAI requires tool results to be in role="tool" messages. + # This transformation ensures the message history is valid for the LLM provider. + _convert_approval_results_to_tool_messages(messages) + + +def _convert_approval_results_to_tool_messages(messages: list[Any]) -> None: + """Convert function_result content in user messages to proper tool messages. + + After approval processing, tool results end up in user messages. OpenAI and other + providers require tool results to be in role="tool" messages. This function + extracts function_result content from user messages and creates proper tool messages. + + This modifies the messages list in place. + + Args: + messages: List of ChatMessage objects to process + """ + i = 0 + while i < len(messages): + msg = messages[i] + role_value = get_role_value(msg) + + if role_value != "user": + i += 1 + continue + + # Check if this user message has function_result content + function_results: list[Content] = [] + other_contents: list[Any] = [] + + for content in msg.contents or []: + if getattr(content, "type", None) == "function_result": + function_results.append(content) + else: + other_contents.append(content) + + if not function_results: + i += 1 + continue + + # We have function results in a user message - need to fix this + logger.info( + f"Converting {len(function_results)} function_result content(s) from user message to tool message(s)" + ) + + # Create tool messages for each function result + new_tool_messages = [] + for func_result in function_results: + tool_msg = ChatMessage( + role="tool", + contents=[func_result], + ) + new_tool_messages.append(tool_msg) + + if other_contents: + # Keep the user message with remaining contents + msg.contents = other_contents + # Insert tool messages after this user message + for j, tool_msg in enumerate(new_tool_messages): + messages.insert(i + 1 + j, tool_msg) + i += 1 + len(new_tool_messages) + else: + # No other contents - replace user message with tool messages + messages.pop(i) + for j, tool_msg in enumerate(new_tool_messages): + messages.insert(i + j, tool_msg) + i += len(new_tool_messages) + def _build_messages_snapshot( flow: FlowState, @@ -630,25 +707,29 @@ def _build_messages_snapshot( """Build MessagesSnapshotEvent from current flow state.""" all_messages = list(snapshot_messages) - # Add assistant message with tool calls + # Add assistant message with tool calls only (no content) if flow.pending_tool_calls: tool_call_message = { "id": flow.message_id or generate_event_id(), "role": "assistant", "tool_calls": flow.pending_tool_calls.copy(), } - if flow.accumulated_text: - tool_call_message["content"] = flow.accumulated_text all_messages.append(tool_call_message) # Add tool results all_messages.extend(flow.tool_results) - # Add text-only assistant message if no tool calls - if flow.accumulated_text and not flow.pending_tool_calls: + # Add text-only assistant message if there is accumulated text + # This is a separate message from the tool calls message to maintain + # the expected AG-UI protocol format (see issue #3619) + if flow.accumulated_text: + # Use a new ID for the content message if we had tool calls (separate message) + content_message_id = ( + generate_event_id() if flow.pending_tool_calls else (flow.message_id or generate_event_id()) + ) all_messages.append( { - "id": flow.message_id or generate_event_id(), + "id": content_message_id, "role": "assistant", "content": 
flow.accumulated_text, } @@ -922,6 +1003,20 @@ async def run_agent_stream( tool_call_id, ) + # Parse function arguments - skip confirm_changes if we can't parse + # (we can't ask user to confirm something we can't properly display) + try: + function_arguments = json.loads(tool_call.get("function", {}).get("arguments", "{}")) + except json.JSONDecodeError: + logger.warning( + "Failed to decode JSON arguments for confirm_changes tool '%s' " + "(tool_call_id=%s). Skipping confirmation flow - cannot display " + "malformed arguments to user for approval.", + tool_name, + tool_call_id, + ) + continue # Skip to next tool call without emitting confirm_changes + # Emit confirm_changes tool call confirm_id = generate_event_id() yield ToolCallStartEvent( @@ -932,7 +1027,7 @@ async def run_agent_stream( confirm_args = { "function_name": tool_name, "function_call_id": tool_call_id, - "function_arguments": json.loads(tool_call.get("function", {}).get("arguments", "{}")), + "function_arguments": function_arguments, "steps": [{"description": f"Execute {tool_name}", "status": "enabled"}], } yield ToolCallArgsEvent(tool_call_id=confirm_id, delta=json.dumps(confirm_args)) diff --git a/python/packages/ag-ui/tests/test_message_hygiene.py b/python/packages/ag-ui/tests/test_message_hygiene.py index ecc01de3cb..eb55965ab9 100644 --- a/python/packages/ag-ui/tests/test_message_hygiene.py +++ b/python/packages/ag-ui/tests/test_message_hygiene.py @@ -48,3 +48,138 @@ def test_deduplicate_messages_prefers_non_empty_tool_results() -> None: deduped = _deduplicate_messages(messages) assert len(deduped) == 1 assert deduped[0].contents[0].result == "result data" + + +def test_convert_approval_results_to_tool_messages() -> None: + """Test that function_result content in user messages gets converted to tool messages. + + This is a regression test for the MCP tool double-call bug where approved tool + results ended up in user messages instead of tool messages, causing OpenAI to + reject the request with 'tool_call_ids did not have response messages'. + """ + from agent_framework_ag_ui._run import _convert_approval_results_to_tool_messages + + # Simulate what happens after _resolve_approval_responses: + # A user message contains function_result content (the executed tool result) + messages = [ + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call(call_id="call_123", name="my_mcp_tool", arguments="{}"), + ], + ), + ChatMessage( + role="user", + contents=[ + Content.from_function_result(call_id="call_123", result="tool execution result"), + ], + ), + ] + + _convert_approval_results_to_tool_messages(messages) + + # After conversion, the function result should be in a tool message, not user message + assert len(messages) == 2 + + # First message unchanged + assert messages[0].role.value == "assistant" + + # Second message should now be role="tool" + assert messages[1].role.value == "tool" + assert messages[1].contents[0].type == "function_result" + assert messages[1].contents[0].call_id == "call_123" + + +def test_convert_approval_results_preserves_other_user_content() -> None: + """Test that user messages with mixed content are handled correctly. + + If a user message has both function_result content and other content (like text), + the function_result content should be extracted to a tool message while the + remaining content stays in the user message. 
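+
+    A minimal sketch of the rewrite this exercises (message contents only):
+
+        user[text, function_result]  ->  user[text], tool[function_result]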
+ """ + from agent_framework_ag_ui._run import _convert_approval_results_to_tool_messages + + messages = [ + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call(call_id="call_123", name="my_tool", arguments="{}"), + ], + ), + ChatMessage( + role="user", + contents=[ + Content.from_text(text="User also said something"), + Content.from_function_result(call_id="call_123", result="tool result"), + ], + ), + ] + + _convert_approval_results_to_tool_messages(messages) + + # Should have 3 messages now: assistant, user (with text), tool (with result) + assert len(messages) == 3 + + # First message unchanged + assert messages[0].role.value == "assistant" + + # Second message should be user with just text + assert messages[1].role.value == "user" + assert len(messages[1].contents) == 1 + assert messages[1].contents[0].type == "text" + + # Third message should be tool with result + assert messages[2].role.value == "tool" + assert messages[2].contents[0].type == "function_result" + + +def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: + """Test that confirm_changes is properly handled across multiple requests. + + This is a regression test for the MCP tool double-call bug. When a second + MCP tool call happens after the first one completed, the message history + should have proper synthetic results for all confirm_changes calls. + """ + # Simulate history from first request (already completed) + # Note: confirm_changes synthetic result might be missing from frontend's snapshot + messages = [ + # First request - user asks something + ChatMessage( + role="user", + contents=[Content.from_text(text="What time is it?")], + ), + # First request - assistant calls MCP tool + confirm_changes + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call(call_id="call_1", name="get_datetime", arguments="{}"), + Content.from_function_call(call_id="call_c1", name="confirm_changes", arguments="{}"), + ], + ), + # First request - tool result for the actual MCP tool + ChatMessage( + role="tool", + contents=[Content.from_function_result(call_id="call_1", result="2024-01-01 12:00:00")], + ), + # Note: NO tool result for call_c1 (confirm_changes) - this is the bug scenario! + # The synthetic was injected in request 1 but not persisted in snapshot + # Second request - user asks something else + ChatMessage( + role="user", + contents=[Content.from_text(text="What's the date?")], + ), + ] + + sanitized = _sanitize_tool_history(messages) + + # After sanitization, call_c1 should have a synthetic result + tool_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" + ] + + # Should have 2 tool messages: one for call_1 (real) and one for call_c1 (synthetic) + assert len(tool_messages) == 2 + + tool_call_ids = {str(msg.contents[0].call_id) for msg in tool_messages} + assert "call_1" in tool_call_ids + assert "call_c1" in tool_call_ids diff --git a/python/packages/ag-ui/tests/test_run.py b/python/packages/ag-ui/tests/test_run.py index a415000692..5cfcfe7619 100644 --- a/python/packages/ag-ui/tests/test_run.py +++ b/python/packages/ag-ui/tests/test_run.py @@ -353,6 +353,58 @@ def test_emit_tool_call_generates_id(): assert flow.tool_call_id is not None # ID should be generated +def test_emit_tool_result_closes_open_message(): + """Test _emit_tool_result emits TextMessageEndEvent for open text message. 
+ + This is a regression test for issue #3568 where TEXT_MESSAGE_END was not + emitted when using MCP tools because the message_id was reset without + closing the message first. + """ + from ag_ui.core import TextMessageEndEvent + + from agent_framework_ag_ui._run import FlowState, _emit_tool_result + + flow = FlowState() + # Simulate an open text message (e.g., from Feature #4 tool-only detection) + flow.message_id = "open-msg-123" + flow.tool_call_id = "call_456" + + content = Content.from_function_result(call_id="call_456", result="tool result") + + events = _emit_tool_result(content, flow, predictive_handler=None) + + # Should have: ToolCallEndEvent, ToolCallResultEvent, TextMessageEndEvent + assert len(events) == 3 + + # Verify TextMessageEndEvent is emitted for the open message + text_end_events = [e for e in events if isinstance(e, TextMessageEndEvent)] + assert len(text_end_events) == 1 + assert text_end_events[0].message_id == "open-msg-123" + + # Verify message_id is reset after + assert flow.message_id is None + + +def test_emit_tool_result_no_open_message(): + """Test _emit_tool_result works when there's no open text message.""" + from ag_ui.core import TextMessageEndEvent + + from agent_framework_ag_ui._run import FlowState, _emit_tool_result + + flow = FlowState() + # No open message + flow.message_id = None + flow.tool_call_id = "call_456" + + content = Content.from_function_result(call_id="call_456", result="tool result") + + events = _emit_tool_result(content, flow, predictive_handler=None) + + # Should have: ToolCallEndEvent, ToolCallResultEvent (no TextMessageEndEvent) + text_end_events = [e for e in events if isinstance(e, TextMessageEndEvent)] + assert len(text_end_events) == 0 + + def test_extract_approved_state_updates_no_handler(): """Test _extract_approved_state_updates returns empty with no handler.""" from agent_framework_ag_ui._run import _extract_approved_state_updates @@ -371,3 +423,143 @@ def test_extract_approved_state_updates_no_approval(): messages = [ChatMessage(role="user", contents=[Content.from_text("Hello")])] result = _extract_approved_state_updates(messages, handler) assert result == {} + + +class TestBuildMessagesSnapshot: + """Tests for _build_messages_snapshot function.""" + + def test_tool_calls_and_text_are_separate_messages(self): + """Test that tool calls and text content are emitted as separate messages. + + This is a regression test for issue #3619 where tool calls and content + were incorrectly merged into a single assistant message. + """ + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.message_id = "msg-123" + flow.pending_tool_calls = [ + {"id": "call_1", "function": {"name": "get_weather", "arguments": '{"city": "NYC"}'}}, + ] + flow.accumulated_text = "Here is the weather information." 
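+        # One finished tool result, keyed by the AG-UI camelCase "toolCallId" field.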
+ flow.tool_results = [{"id": "result-1", "role": "tool", "content": '{"temp": 72}', "toolCallId": "call_1"}] + + result = _build_messages_snapshot(flow, []) + + # Should have 3 messages: tool call msg, tool result, text content msg + assert len(result.messages) == 3 + + # First message: assistant with tool calls only (no content) + assistant_tool_msg = result.messages[0] + assert assistant_tool_msg.role == "assistant" + assert assistant_tool_msg.tool_calls is not None + assert len(assistant_tool_msg.tool_calls) == 1 + assert assistant_tool_msg.content is None + + # Second message: tool result + tool_result_msg = result.messages[1] + assert tool_result_msg.role == "tool" + + # Third message: assistant with content only (no tool calls) + assistant_text_msg = result.messages[2] + assert assistant_text_msg.role == "assistant" + assert assistant_text_msg.content == "Here is the weather information." + assert assistant_text_msg.tool_calls is None + + # The text message should have a different ID than the tool call message + assert assistant_text_msg.id != assistant_tool_msg.id + + def test_only_tool_calls_no_text(self): + """Test snapshot with only tool calls and no accumulated text.""" + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.message_id = "msg-123" + flow.pending_tool_calls = [ + {"id": "call_1", "function": {"name": "get_weather", "arguments": "{}"}}, + ] + flow.accumulated_text = "" + flow.tool_results = [] + + result = _build_messages_snapshot(flow, []) + + # Should have 1 message: tool call msg only + assert len(result.messages) == 1 + assert result.messages[0].role == "assistant" + assert result.messages[0].tool_calls is not None + assert result.messages[0].content is None + + def test_only_text_no_tool_calls(self): + """Test snapshot with only text and no tool calls.""" + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.message_id = "msg-123" + flow.pending_tool_calls = [] + flow.accumulated_text = "Hello world" + flow.tool_results = [] + + result = _build_messages_snapshot(flow, []) + + # Should have 1 message: text content msg only + assert len(result.messages) == 1 + assert result.messages[0].role == "assistant" + assert result.messages[0].content == "Hello world" + assert result.messages[0].tool_calls is None + # Should use the existing message_id + assert result.messages[0].id == "msg-123" + + def test_preserves_snapshot_messages(self): + """Test that existing snapshot messages are preserved.""" + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.pending_tool_calls = [] + flow.accumulated_text = "" + + existing_messages = [ + {"id": "user-1", "role": "user", "content": "Hello"}, + {"id": "assist-1", "role": "assistant", "content": "Hi there"}, + ] + + result = _build_messages_snapshot(flow, existing_messages) + + assert len(result.messages) == 2 + assert result.messages[0].id == "user-1" + assert result.messages[1].id == "assist-1" + + +def test_malformed_json_in_confirm_args_skips_confirmation(): + """Test that malformed JSON in tool arguments skips confirm_changes flow. + + This is a regression test to ensure that when tool arguments contain malformed + JSON, the code skips the confirmation flow entirely rather than crashing or + showing incomplete data to the user. 
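+
+    Note: this mirrors the try/except json.JSONDecodeError guard used in
+    run_agent_stream rather than driving the full streaming loop.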
+ """ + import json + + # Simulate the parsing logic - malformed JSON should trigger skip + malformed_arguments = "{ invalid json }" + tool_call = {"function": {"name": "write_doc", "arguments": malformed_arguments}} + + # This is what the code should do - detect parsing failure and skip + should_skip_confirmation = False + try: + json.loads(tool_call.get("function", {}).get("arguments", "{}")) + except json.JSONDecodeError: + should_skip_confirmation = True + + # Should skip confirmation when JSON is malformed + assert should_skip_confirmation is True + + # Valid JSON should proceed with confirmation + valid_arguments = '{"content": "hello"}' + tool_call_valid = {"function": {"name": "write_doc", "arguments": valid_arguments}} + should_skip_confirmation = False + try: + function_arguments = json.loads(tool_call_valid.get("function", {}).get("arguments", "{}")) + except json.JSONDecodeError: + should_skip_confirmation = True + + assert should_skip_confirmation is False + assert function_arguments == {"content": "hello"} From 76c70df5eeb665a27fd81203d6416b638764393c Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 12:04:01 +0900 Subject: [PATCH 2/6] Fixes --- .../_message_adapters.py | 12 +- .../ag-ui/agent_framework_ag_ui/_run.py | 28 ++- .../agents/human_in_the_loop_agent.py | 40 +++- .../test_mcp_double_call_fix.py | 184 ++++++++++++++++++ .../ag-ui/tests/test_message_adapters.py | 12 +- 5 files changed, 263 insertions(+), 13 deletions(-) create mode 100644 python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index cf189b0a4d..f798fadc86 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -452,11 +452,15 @@ def _filter_modified_args( merged_args["steps"] = merged_steps state_args = merged_args - # Keep the original tool call and AG-UI snapshot in sync with approved args. - updated_args = ( - json.dumps(merged_args) if isinstance(matching_func_call.arguments, str) else merged_args + # Update the ChatMessage tool call with only enabled steps (for LLM context). + # The LLM should only see the steps that were actually approved/executed. + updated_args_for_llm = ( + json.dumps(filtered_args) if isinstance(matching_func_call.arguments, str) else filtered_args ) - matching_func_call.arguments = updated_args + matching_func_call.arguments = updated_args_for_llm + + # Update raw messages with all steps + status (for MESSAGES_SNAPSHOT display). + # This allows the UI to show which steps were enabled/disabled. 
_update_tool_call_arguments(messages, str(approval_call_id), merged_args) # Create a new FunctionCallContent with the modified arguments func_call_for_approval = Content.from_function_call( diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index e9b7554e17..943869cc48 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -461,9 +461,21 @@ def _emit_approval_request( "function_arguments": make_json_safe(func_call.parse_arguments()) or {}, "steps": [{"description": f"Execute {func_name}", "status": "enabled"}], } - events.append(ToolCallArgsEvent(tool_call_id=confirm_id, delta=json.dumps(args))) + args_json = json.dumps(args) + events.append(ToolCallArgsEvent(tool_call_id=confirm_id, delta=args_json)) events.append(ToolCallEndEvent(tool_call_id=confirm_id)) + # Track confirm_changes in pending_tool_calls for MessagesSnapshotEvent + # The frontend needs to see this in the snapshot to render the confirmation dialog + confirm_entry = { + "id": confirm_id, + "type": "function", + "function": {"name": "confirm_changes", "arguments": args_json}, + } + flow.pending_tool_calls.append(confirm_entry) + flow.tool_calls_by_id[confirm_id] = confirm_entry + flow.tool_calls_ended.add(confirm_id) # Mark as ended since we emit End event + flow.waiting_for_approval = True return events @@ -1030,8 +1042,20 @@ async def run_agent_stream( "function_arguments": function_arguments, "steps": [{"description": f"Execute {tool_name}", "status": "enabled"}], } - yield ToolCallArgsEvent(tool_call_id=confirm_id, delta=json.dumps(confirm_args)) + confirm_args_json = json.dumps(confirm_args) + yield ToolCallArgsEvent(tool_call_id=confirm_id, delta=confirm_args_json) yield ToolCallEndEvent(tool_call_id=confirm_id) + + # Track confirm_changes in pending_tool_calls for MessagesSnapshotEvent + # The frontend needs to see this in the snapshot to render the confirmation dialog + confirm_entry = { + "id": confirm_id, + "type": "function", + "function": {"name": "confirm_changes", "arguments": confirm_args_json}, + } + flow.pending_tool_calls.append(confirm_entry) + flow.tool_calls_by_id[confirm_id] = confirm_entry + flow.tool_calls_ended.add(confirm_id) # Mark as ended since we emit End event flow.waiting_for_approval = True # Close any open message diff --git a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py index 368c4e47ed..fde3fb56a7 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py +++ b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py @@ -1,7 +1,14 @@ # Copyright (c) Microsoft. All rights reserved. -"""Human-in-the-loop agent demonstrating step customization (Feature 5).""" +"""Human-in-the-loop agent demonstrating step customization (Feature 5). +This agent also serves as a test case for the MCP tool double-call bug fix. +To test: call the agent twice with tasks that require approval (e.g., "Build a robot" +then "Build a house"). Before the fix, the second call would fail with: +"An assistant message with 'tool_calls' must be followed by tool messages..." 
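+
+The underlying fixes live in agent_framework_ag_ui._message_adapters
+(_sanitize_tool_history) and agent_framework_ag_ui._run
+(_convert_approval_results_to_tool_messages).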
+""" + +from datetime import datetime from enum import Enum from typing import Any @@ -23,6 +30,24 @@ class TaskStep(BaseModel): status: StepStatus = Field(default=StepStatus.ENABLED, description="Whether the step is enabled or disabled") +# Simple tool for quick testing of the double-call bug fix +@tool( + name="get_current_time", + description="Get the current date and time. Requires user approval.", + approval_mode="always_require", +) +def get_current_time() -> str: + """Get the current date and time. + + This is a simple tool for testing the approval flow. Call it multiple times + to verify the double-call bug fix works. + + Returns: + Current date and time string. + """ + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + @tool( name="generate_task_steps", description="Generate execution steps for a task", @@ -55,7 +80,13 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A return ChatAgent( name="human_in_the_loop_agent", instructions="""You are a helpful assistant that can perform any task by breaking it down into steps. + You can also tell the user the current time. + + FOR TIME REQUESTS: + When the user asks "what time is it?" or similar, call the `get_current_time` function. + This is useful for testing the approval flow multiple times quickly. + FOR TASK PLANNING: When asked to perform a task, you MUST call the `generate_task_steps` function with the proper number of steps per the request. @@ -76,11 +107,10 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A 9. "Calibrate systems" 10. "Final testing" - IMPORTANT: When you call generate_task_steps, the user will be shown the steps and asked to approve. + IMPORTANT: Both tools require user approval before execution. Do NOT output any text along with the function call - just call the function. - After the user approves and the function executes, THEN provide a brief acknowledgment like: - "The plan has been created with X steps selected." + After the user approves and the function executes, provide a brief response with the result. """, chat_client=chat_client, - tools=[generate_task_steps], + tools=[get_current_time, generate_task_steps], ) diff --git a/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py b/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py new file mode 100644 index 0000000000..1e3373ce04 --- /dev/null +++ b/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py @@ -0,0 +1,184 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Test sample for validating MCP tool double-call bug fix. + +This sample demonstrates the fix for the bug where calling an MCP tool (or any tool +with `approval_mode="always_require"`) twice would fail with: + + "An assistant message with 'tool_calls' must be followed by tool messages + responding to each 'tool_call_id'" + +The bug was caused by: +1. Tool results from approved tools ending up in `role="user"` messages instead of `role="tool"` +2. Tool call IDs not being removed from pending tracking after seeing their results + +To test this sample: +1. Set environment variables: + - AZURE_OPENAI_ENDPOINT + - AZURE_OPENAI_CHAT_DEPLOYMENT_NAME + +2. Run the server: + python -m getting_started.test_mcp_double_call_fix + +3. Use the AG-UI frontend or curl to test: + - First request: "What time is it?" -> Should work, shows approval dialog + - Approve the request + - Second request: "What's the date?" -> Before fix: ERROR, After fix: Works! 
+""" + +import logging +import os +from datetime import datetime + +from agent_framework import ChatAgent, tool +from agent_framework.ag_ui import AgentFrameworkAgent, add_agent_framework_fastapi_endpoint +from agent_framework.azure import AzureOpenAIChatClient +from dotenv import load_dotenv +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +load_dotenv() + +# Enable detailed logging to see the fix in action +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + +# Also enable AG-UI specific logging +logging.getLogger("agent_framework_ag_ui").setLevel(logging.INFO) + + +# Tool with approval_mode="always_require" - simulates MCP tool behavior +@tool( + name="get_datetime", + description="Get the current date and time. Requires user approval before execution.", + approval_mode="always_require", +) +def get_datetime() -> str: + """Get the current date and time. + + This tool requires user approval before execution, similar to MCP tools. + + Returns: + The current date and time as a formatted string. + """ + now = datetime.now() + result = now.strftime("%Y-%m-%d %H:%M:%S") + logger.info(f"get_datetime tool executed, returning: {result}") + return result + + +# Another approval-required tool to test multiple different tools +@tool( + name="get_system_info", + description="Get system information. Requires user approval before execution.", + approval_mode="always_require", +) +def get_system_info() -> str: + """Get basic system information. + + This tool also requires user approval, to test calling different approval tools. + + Returns: + Basic system information. + """ + import platform + + info = f"Python {platform.python_version()} on {platform.system()} {platform.release()}" + logger.info(f"get_system_info tool executed, returning: {info}") + return info + + +def create_test_agent() -> ChatAgent: + """Create the test agent with approval-required tools.""" + endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") + deployment_name = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME") + + if not endpoint: + raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is required") + if not deployment_name: + raise ValueError("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME environment variable is required") + + return ChatAgent( + name="MCPTestAgent", + instructions="""You are a helpful assistant that can provide date/time and system information. + +When the user asks about time, date, or "what time is it", use the get_datetime tool. +When the user asks about system info, use the get_system_info tool. + +Important: These tools require user approval before execution. The user will see +an approval dialog and can choose to approve or reject the tool call. + +After a tool is approved and executed, provide a brief, friendly response with the result. 
+""", + chat_client=AzureOpenAIChatClient( + endpoint=endpoint, + deployment_name=deployment_name, + ), + tools=[get_datetime, get_system_info], + ) + + +# Create FastAPI app +app = FastAPI( + title="MCP Double-Call Bug Test", + description="Test server for validating the MCP tool double-call bug fix", +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Create the agent +agent = create_test_agent() + +# Register the AG-UI endpoint with require_confirmation=True (like the bug report) +add_agent_framework_fastapi_endpoint( + app=app, + agent=AgentFrameworkAgent( + agent=agent, + name="MCPTestAgent", + description="Test agent for MCP double-call bug fix validation", + require_confirmation=True, + ), + path="/", +) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "healthy", "message": "MCP double-call test server is running"} + + +def main(): + """Run the test server.""" + import uvicorn + + port = int(os.getenv("PORT", "8889")) + host = os.getenv("HOST", "127.0.0.1") + + print("\n" + "=" * 70) + print("MCP Double-Call Bug Fix Test Server") + print("=" * 70) + print(f"\nServer starting on http://{host}:{port}") + print("\nTo test the fix:") + print("1. Send: 'What time is it?' -> Approve -> Should work") + print("2. Send: 'What time is it?' again -> Approve -> Should also work!") + print(" (Before the fix, this second call would fail)") + print("\nEndpoints:") + print(f" - AG-UI: POST http://{host}:{port}/") + print(f" - Health: GET http://{host}:{port}/health") + print("=" * 70 + "\n") + + uvicorn.run(app, host=host, port=port, log_level="info") + + +if __name__ == "__main__": + main() diff --git a/python/packages/ag-ui/tests/test_message_adapters.py b/python/packages/ag-ui/tests/test_message_adapters.py index 4f6c3f1d42..15bb817be8 100644 --- a/python/packages/ag-ui/tests/test_message_adapters.py +++ b/python/packages/ag-ui/tests/test_message_adapters.py @@ -98,7 +98,14 @@ def test_agui_tool_result_to_agent_framework(): def test_agui_tool_approval_updates_tool_call_arguments(): - """Tool approval updates matching tool call arguments for snapshots and agent context.""" + """Tool approval updates matching tool call arguments for snapshots and agent context. + + The LLM context (ChatMessage) should contain only enabled steps, so the LLM + generates responses based on what was actually approved/executed. + + The raw messages (for MESSAGES_SNAPSHOT) should contain all steps with status, + so the UI can show which steps were enabled/disabled. 
+ """ messages_input = [ { "role": "assistant", @@ -142,13 +149,14 @@ def test_agui_tool_approval_updates_tool_call_arguments(): assert len(messages) == 2 assistant_msg = messages[0] func_call = next(content for content in assistant_msg.contents if content.type == "function_call") + # LLM context should only have enabled steps (what was actually approved) assert func_call.arguments == { "steps": [ {"description": "Boil water", "status": "enabled"}, - {"description": "Brew coffee", "status": "disabled"}, {"description": "Serve coffee", "status": "enabled"}, ] } + # Raw messages (for MESSAGES_SNAPSHOT) should have all steps with status assert messages_input[0]["tool_calls"][0]["function"]["arguments"] == { "steps": [ {"description": "Boil water", "status": "enabled"}, From b8a4d71bf869fb8b454f7d578ef15178cb02b65f Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 14:42:30 +0900 Subject: [PATCH 3/6] Fixes --- .../_message_adapters.py | 31 ++- .../ag-ui/agent_framework_ag_ui/_run.py | 4 + .../test_mcp_double_call_fix.py | 184 ------------------ .../ag-ui/tests/test_message_hygiene.py | 128 +++++++++--- python/packages/ag-ui/tests/test_run.py | 141 +++++++++++++- 5 files changed, 271 insertions(+), 217 deletions(-) delete mode 100644 python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index f798fadc86..798569d2ca 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -45,7 +45,32 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: confirm_changes_call = content break - sanitized.append(msg) + # Filter out confirm_changes from assistant messages before sending to LLM. + # confirm_changes is a synthetic tool for the approval UI flow - the LLM shouldn't + # see it because it may contain stale function_arguments that confuse the model + # (e.g., showing 5 steps when only 2 were approved). + # When we filter out confirm_changes, we also remove it from tool_ids and don't + # set pending_confirm_changes_id, so no synthetic result is injected for it. + # This is required because OpenAI validates that every tool result has a matching + # tool call in the previous assistant message. + if confirm_changes_call: + filtered_contents = [ + c for c in (msg.contents or []) if not (c.type == "function_call" and c.name == "confirm_changes") + ] + if filtered_contents: + # Create a new message without confirm_changes + msg = ChatMessage(role=msg.role, contents=filtered_contents) + sanitized.append(msg) + # If no contents left after filtering, don't append anything + + # Remove confirm_changes from tool_ids since we filtered it from the message + if confirm_changes_call.call_id: + tool_ids.discard(str(confirm_changes_call.call_id)) + # Don't set pending_confirm_changes_id - we don't want a synthetic result + confirm_changes_call = None + else: + sanitized.append(msg) + pending_tool_call_ids = tool_ids if tool_ids else None pending_confirm_changes_id = ( str(confirm_changes_call.call_id) if confirm_changes_call and confirm_changes_call.call_id else None @@ -455,7 +480,9 @@ def _filter_modified_args( # Update the ChatMessage tool call with only enabled steps (for LLM context). # The LLM should only see the steps that were actually approved/executed. 
updated_args_for_llm = ( - json.dumps(filtered_args) if isinstance(matching_func_call.arguments, str) else filtered_args + json.dumps(filtered_args) + if isinstance(matching_func_call.arguments, str) + else filtered_args ) matching_func_call.arguments = updated_args_for_llm diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index 943869cc48..6457ce3ba8 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -391,6 +391,7 @@ def _emit_tool_result( # This handles the case where a TextMessageStartEvent was emitted for tool-only # messages (Feature #4) but needs to be closed before starting a new message if flow.message_id: + logger.info(f"Closing text message (issue #3568 fix): message_id={flow.message_id}") events.append(TextMessageEndEvent(message_id=flow.message_id)) flow.message_id = None # Reset so next text content starts a new message @@ -920,6 +921,8 @@ async def run_agent_stream( # Emit events for each content item for content in update.contents: + content_type = getattr(content, "type", None) + logger.debug(f"Processing content type={content_type}, message_id={flow.message_id}") for event in _emit_content( content, flow, @@ -1060,6 +1063,7 @@ async def run_agent_stream( # Close any open message if flow.message_id: + logger.info(f"End of run: closing text message message_id={flow.message_id}") yield TextMessageEndEvent(message_id=flow.message_id) # Emit MessagesSnapshotEvent if we have tool calls or results diff --git a/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py b/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py deleted file mode 100644 index 1e3373ce04..0000000000 --- a/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -"""Test sample for validating MCP tool double-call bug fix. - -This sample demonstrates the fix for the bug where calling an MCP tool (or any tool -with `approval_mode="always_require"`) twice would fail with: - - "An assistant message with 'tool_calls' must be followed by tool messages - responding to each 'tool_call_id'" - -The bug was caused by: -1. Tool results from approved tools ending up in `role="user"` messages instead of `role="tool"` -2. Tool call IDs not being removed from pending tracking after seeing their results - -To test this sample: -1. Set environment variables: - - AZURE_OPENAI_ENDPOINT - - AZURE_OPENAI_CHAT_DEPLOYMENT_NAME - -2. Run the server: - python -m getting_started.test_mcp_double_call_fix - -3. Use the AG-UI frontend or curl to test: - - First request: "What time is it?" -> Should work, shows approval dialog - - Approve the request - - Second request: "What's the date?" -> Before fix: ERROR, After fix: Works! 
-""" - -import logging -import os -from datetime import datetime - -from agent_framework import ChatAgent, tool -from agent_framework.ag_ui import AgentFrameworkAgent, add_agent_framework_fastapi_endpoint -from agent_framework.azure import AzureOpenAIChatClient -from dotenv import load_dotenv -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware - -load_dotenv() - -# Enable detailed logging to see the fix in action -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - -# Also enable AG-UI specific logging -logging.getLogger("agent_framework_ag_ui").setLevel(logging.INFO) - - -# Tool with approval_mode="always_require" - simulates MCP tool behavior -@tool( - name="get_datetime", - description="Get the current date and time. Requires user approval before execution.", - approval_mode="always_require", -) -def get_datetime() -> str: - """Get the current date and time. - - This tool requires user approval before execution, similar to MCP tools. - - Returns: - The current date and time as a formatted string. - """ - now = datetime.now() - result = now.strftime("%Y-%m-%d %H:%M:%S") - logger.info(f"get_datetime tool executed, returning: {result}") - return result - - -# Another approval-required tool to test multiple different tools -@tool( - name="get_system_info", - description="Get system information. Requires user approval before execution.", - approval_mode="always_require", -) -def get_system_info() -> str: - """Get basic system information. - - This tool also requires user approval, to test calling different approval tools. - - Returns: - Basic system information. - """ - import platform - - info = f"Python {platform.python_version()} on {platform.system()} {platform.release()}" - logger.info(f"get_system_info tool executed, returning: {info}") - return info - - -def create_test_agent() -> ChatAgent: - """Create the test agent with approval-required tools.""" - endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") - deployment_name = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME") - - if not endpoint: - raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is required") - if not deployment_name: - raise ValueError("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME environment variable is required") - - return ChatAgent( - name="MCPTestAgent", - instructions="""You are a helpful assistant that can provide date/time and system information. - -When the user asks about time, date, or "what time is it", use the get_datetime tool. -When the user asks about system info, use the get_system_info tool. - -Important: These tools require user approval before execution. The user will see -an approval dialog and can choose to approve or reject the tool call. - -After a tool is approved and executed, provide a brief, friendly response with the result. 
-""", - chat_client=AzureOpenAIChatClient( - endpoint=endpoint, - deployment_name=deployment_name, - ), - tools=[get_datetime, get_system_info], - ) - - -# Create FastAPI app -app = FastAPI( - title="MCP Double-Call Bug Test", - description="Test server for validating the MCP tool double-call bug fix", -) - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Create the agent -agent = create_test_agent() - -# Register the AG-UI endpoint with require_confirmation=True (like the bug report) -add_agent_framework_fastapi_endpoint( - app=app, - agent=AgentFrameworkAgent( - agent=agent, - name="MCPTestAgent", - description="Test agent for MCP double-call bug fix validation", - require_confirmation=True, - ), - path="/", -) - - -@app.get("/health") -async def health_check(): - """Health check endpoint.""" - return {"status": "healthy", "message": "MCP double-call test server is running"} - - -def main(): - """Run the test server.""" - import uvicorn - - port = int(os.getenv("PORT", "8889")) - host = os.getenv("HOST", "127.0.0.1") - - print("\n" + "=" * 70) - print("MCP Double-Call Bug Fix Test Server") - print("=" * 70) - print(f"\nServer starting on http://{host}:{port}") - print("\nTo test the fix:") - print("1. Send: 'What time is it?' -> Approve -> Should work") - print("2. Send: 'What time is it?' again -> Approve -> Should also work!") - print(" (Before the fix, this second call would fail)") - print("\nEndpoints:") - print(f" - AG-UI: POST http://{host}:{port}/") - print(f" - Health: GET http://{host}:{port}/health") - print("=" * 70 + "\n") - - uvicorn.run(app, host=host, port=port, log_level="info") - - -if __name__ == "__main__": - main() diff --git a/python/packages/ag-ui/tests/test_message_hygiene.py b/python/packages/ag-ui/tests/test_message_hygiene.py index eb55965ab9..b2f74e60a7 100644 --- a/python/packages/ag-ui/tests/test_message_hygiene.py +++ b/python/packages/ag-ui/tests/test_message_hygiene.py @@ -5,7 +5,13 @@ from agent_framework_ag_ui._message_adapters import _deduplicate_messages, _sanitize_tool_history -def test_sanitize_tool_history_injects_confirm_changes_result() -> None: +def test_sanitize_tool_history_filters_out_confirm_changes_only_message() -> None: + """Test that assistant messages with ONLY confirm_changes are filtered out entirely. + + When an assistant message contains only a confirm_changes tool call (no other tools), + the entire message should be filtered out because confirm_changes is a synthetic + tool for the approval UI flow that shouldn't be sent to the LLM. 
+ """ messages = [ ChatMessage( role="assistant", @@ -25,12 +31,17 @@ def test_sanitize_tool_history_injects_confirm_changes_result() -> None: sanitized = _sanitize_tool_history(messages) + # Assistant message with only confirm_changes should be filtered out + assistant_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "assistant" + ] + assert len(assistant_messages) == 0 + + # No synthetic tool result should be injected since confirm_changes was filtered out tool_messages = [ msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" ] - assert len(tool_messages) == 1 - assert str(tool_messages[0].contents[0].call_id) == "call_confirm_123" - assert tool_messages[0].contents[0].result == "Confirmed" + assert len(tool_messages) == 0 def test_deduplicate_messages_prefers_non_empty_tool_results() -> None: @@ -133,22 +144,20 @@ def test_convert_approval_results_preserves_other_user_content() -> None: assert messages[2].contents[0].type == "function_result" -def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: - """Test that confirm_changes is properly handled across multiple requests. +def test_sanitize_tool_history_filters_confirm_changes_keeps_other_tools() -> None: + """Test that confirm_changes is filtered but other tools are preserved. - This is a regression test for the MCP tool double-call bug. When a second - MCP tool call happens after the first one completed, the message history - should have proper synthetic results for all confirm_changes calls. + When an assistant message contains both a real tool call and confirm_changes, + confirm_changes should be filtered out while the real tool call is kept. + No synthetic result is injected for confirm_changes since it's filtered. """ - # Simulate history from first request (already completed) - # Note: confirm_changes synthetic result might be missing from frontend's snapshot messages = [ - # First request - user asks something + # User asks something ChatMessage( role="user", contents=[Content.from_text(text="What time is it?")], ), - # First request - assistant calls MCP tool + confirm_changes + # Assistant calls MCP tool + confirm_changes ChatMessage( role="assistant", contents=[ @@ -156,14 +165,12 @@ def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: Content.from_function_call(call_id="call_c1", name="confirm_changes", arguments="{}"), ], ), - # First request - tool result for the actual MCP tool + # Tool result for the actual MCP tool ChatMessage( role="tool", contents=[Content.from_function_result(call_id="call_1", result="2024-01-01 12:00:00")], ), - # Note: NO tool result for call_c1 (confirm_changes) - this is the bug scenario! 
- # The synthetic was injected in request 1 but not persisted in snapshot - # Second request - user asks something else + # User asks something else ChatMessage( role="user", contents=[Content.from_text(text="What's the date?")], @@ -172,14 +179,91 @@ def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: sanitized = _sanitize_tool_history(messages) - # After sanitization, call_c1 should have a synthetic result + # Find the assistant message + assistant_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "assistant" + ] + assert len(assistant_messages) == 1 + + # Assistant message should only have get_datetime, not confirm_changes + function_call_names = [c.name for c in assistant_messages[0].contents if c.type == "function_call"] + assert "get_datetime" in function_call_names + assert "confirm_changes" not in function_call_names + + # Only one tool message (for call_1), no synthetic for confirm_changes tool_messages = [ msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" ] + assert len(tool_messages) == 1 + assert str(tool_messages[0].contents[0].call_id) == "call_1" + - # Should have 2 tool messages: one for call_1 (real) and one for call_c1 (synthetic) - assert len(tool_messages) == 2 +def test_sanitize_tool_history_filters_confirm_changes_from_assistant_messages() -> None: + """Test that confirm_changes is removed from assistant messages sent to LLM. + This is a regression test for the human-in-the-loop bug where the LLM would see + confirm_changes with function_arguments containing the original steps (e.g., 5 steps) + even when the user only approved a subset (e.g., 2 steps), causing the LLM to + respond with "Here's your 5-step plan" instead of "Here's your 2-step plan". 
+ """ + messages = [ + ChatMessage( + role="user", + contents=[Content.from_text(text="Build a robot")], + ), + # Assistant message with both generate_task_steps and confirm_changes + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call( + call_id="call_1", + name="generate_task_steps", + arguments='{"steps": [{"description": "Step 1"}, {"description": "Step 2"}]}', + ), + Content.from_function_call( + call_id="call_c1", + name="confirm_changes", + arguments='{"function_arguments": {"steps": [{"description": "Step 1"}, {"description": "Step 2"}]}}', + ), + ], + ), + # Approval response + ChatMessage( + role="user", + contents=[ + Content.from_function_approval_response( + approved=True, + id="call_1", + function_call=Content.from_function_call( + call_id="call_1", + name="generate_task_steps", + arguments='{"steps": [{"description": "Step 1"}]}', # Only 1 step approved + ), + ), + ], + ), + ] + + sanitized = _sanitize_tool_history(messages) + + # Find the assistant message in sanitized output + assistant_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "assistant" + ] + + assert len(assistant_messages) == 1 + + # The assistant message should NOT contain confirm_changes + assistant_contents = assistant_messages[0].contents or [] + function_call_names = [c.name for c in assistant_contents if c.type == "function_call"] + assert "generate_task_steps" in function_call_names + assert "confirm_changes" not in function_call_names + + # No synthetic tool result for confirm_changes (it was filtered from the message) + tool_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" + ] + # No tool results expected since there are no completed tool calls + # (the approval response is handled separately by the framework) tool_call_ids = {str(msg.contents[0].call_id) for msg in tool_messages} - assert "call_1" in tool_call_ids - assert "call_c1" in tool_call_ids + assert "call_c1" not in tool_call_ids # No synthetic result for confirm_changes diff --git a/python/packages/ag-ui/tests/test_run.py b/python/packages/ag-ui/tests/test_run.py index 5cfcfe7619..d5198f8dc6 100644 --- a/python/packages/ag-ui/tests/test_run.py +++ b/python/packages/ag-ui/tests/test_run.py @@ -2,12 +2,18 @@ """Tests for _run.py helper functions and FlowState.""" +from ag_ui.core import ( + TextMessageEndEvent, + TextMessageStartEvent, +) from agent_framework import ChatMessage, Content from agent_framework_ag_ui._run import ( FlowState, _build_safe_metadata, _create_state_context_message, + _emit_content, + _emit_tool_result, _has_only_tool_calls, _inject_state_context, _should_suppress_intermediate_snapshot, @@ -356,14 +362,10 @@ def test_emit_tool_call_generates_id(): def test_emit_tool_result_closes_open_message(): """Test _emit_tool_result emits TextMessageEndEvent for open text message. - This is a regression test for issue #3568 where TEXT_MESSAGE_END was not + This is a regression test for where TEXT_MESSAGE_END was not emitted when using MCP tools because the message_id was reset without closing the message first. 
""" - from ag_ui.core import TextMessageEndEvent - - from agent_framework_ag_ui._run import FlowState, _emit_tool_result - flow = FlowState() # Simulate an open text message (e.g., from Feature #4 tool-only detection) flow.message_id = "open-msg-123" @@ -387,10 +389,6 @@ def test_emit_tool_result_closes_open_message(): def test_emit_tool_result_no_open_message(): """Test _emit_tool_result works when there's no open text message.""" - from ag_ui.core import TextMessageEndEvent - - from agent_framework_ag_ui._run import FlowState, _emit_tool_result - flow = FlowState() # No open message flow.message_id = None @@ -563,3 +561,128 @@ def test_malformed_json_in_confirm_args_skips_confirmation(): assert should_skip_confirmation is False assert function_arguments == {"content": "hello"} + + +class TestTextMessageEventBalancing: + """Tests for proper TEXT_MESSAGE_START/END event balancing. + + These tests verify that the streaming flow produces balanced pairs of + TextMessageStartEvent and TextMessageEndEvent, especially when tool + execution is involved. + """ + + def test_tool_only_flow_produces_balanced_events(self): + """Test that a tool-only response produces balanced TEXT_MESSAGE events. + + This simulates the scenario where the LLM immediately calls a tool + without any initial text, then returns text after the tool result. + """ + flow = FlowState() + all_events: list = [] + + # Step 1: LLM outputs function_call only (no text) + func_call_content = Content.from_function_call( + call_id="call_weather", + name="get_weather", + arguments='{"city": "Seattle"}', + ) + + # Feature #4 check: this should trigger TextMessageStartEvent + contents = [func_call_content] + if not flow.message_id and _has_only_tool_calls(contents): + flow.message_id = "tool-msg-1" + all_events.append(TextMessageStartEvent(message_id=flow.message_id, role="assistant")) + + # Emit tool call events + all_events.extend(_emit_content(func_call_content, flow)) + + # Step 2: Tool executes and returns result + func_result_content = Content.from_function_result( + call_id="call_weather", + result='{"temp": 55, "conditions": "rainy"}', + ) + + # This should close the text message + all_events.extend(_emit_tool_result(func_result_content, flow)) + + # Verify message_id was reset + assert flow.message_id is None, "message_id should be reset after tool result" + + # Step 3: LLM outputs text response + text_content = Content.from_text("The weather in Seattle is 55°F and rainy.") + + # Since message_id is None, _emit_text should create a new one + for event in _emit_content(text_content, flow): + all_events.append(event) + + # Step 4: End of stream - emit final TextMessageEndEvent + if flow.message_id: + all_events.append(TextMessageEndEvent(message_id=flow.message_id)) + + # Verify event counts + start_events = [e for e in all_events if isinstance(e, TextMessageStartEvent)] + end_events = [e for e in all_events if isinstance(e, TextMessageEndEvent)] + + # Should have 2 TextMessageStartEvent and 2 TextMessageEndEvent + assert len(start_events) == 2, f"Expected 2 start events, got {len(start_events)}" + assert len(end_events) == 2, f"Expected 2 end events, got {len(end_events)}" + + # Verify order: first message should start and end before second starts + # Find indices + start_indices = [i for i, e in enumerate(all_events) if isinstance(e, TextMessageStartEvent)] + end_indices = [i for i, e in enumerate(all_events) if isinstance(e, TextMessageEndEvent)] + + # First end should come before second start + assert end_indices[0] < 
start_indices[1], ( + f"First TextMessageEndEvent (index {end_indices[0]}) should come " + f"before second TextMessageStartEvent (index {start_indices[1]})" + ) + + def test_text_then_tool_flow(self): + """Test flow where LLM outputs text first, then calls a tool. + + This simulates: "Let me check the weather..." -> tool call -> tool result -> "The weather is..." + """ + flow = FlowState() + all_events: list = [] + + # Step 1: LLM outputs text first + text1 = Content.from_text("Let me check the weather for you.") + all_events.extend(_emit_content(text1, flow)) + + # Verify message_id is set + assert flow.message_id is not None, "message_id should be set after text" + first_msg_id = flow.message_id + + # Step 2: LLM outputs function_call + func_call = Content.from_function_call( + call_id="call_1", + name="get_weather", + arguments="{}", + ) + all_events.extend(_emit_content(func_call, flow)) + + # Step 3: Tool result comes back + func_result = Content.from_function_result(call_id="call_1", result="sunny") + all_events.extend(_emit_tool_result(func_result, flow)) + + # Verify message_id was reset and first message was closed + assert flow.message_id is None + end_events_so_far = [e for e in all_events if isinstance(e, TextMessageEndEvent)] + assert len(end_events_so_far) == 1 + assert end_events_so_far[0].message_id == first_msg_id + + # Step 4: LLM outputs follow-up text + text2 = Content.from_text("The weather is sunny!") + all_events.extend(_emit_content(text2, flow)) + + # Step 5: End of stream + if flow.message_id: + all_events.append(TextMessageEndEvent(message_id=flow.message_id)) + + # Verify balance + start_events = [e for e in all_events if isinstance(e, TextMessageStartEvent)] + end_events = [e for e in all_events if isinstance(e, TextMessageEndEvent)] + + assert len(start_events) == 2 + assert len(end_events) == 2 From a8e11ad16df41bf4d704463a1233b68564913d4f Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 14:43:56 +0900 Subject: [PATCH 4/6] Revert human_in_the_loop_agent.py changes --- .../agents/human_in_the_loop_agent.py | 40 +++---------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py index fde3fb56a7..368c4e47ed 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py +++ b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py @@ -1,14 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. -"""Human-in-the-loop agent demonstrating step customization (Feature 5). +"""Human-in-the-loop agent demonstrating step customization (Feature 5).""" -This agent also serves as a test case for the MCP tool double-call bug fix. -To test: call the agent twice with tasks that require approval (e.g., "Build a robot" -then "Build a house"). Before the fix, the second call would fail with: -"An assistant message with 'tool_calls' must be followed by tool messages..." -""" - -from datetime import datetime from enum import Enum from typing import Any @@ -30,24 +23,6 @@ class TaskStep(BaseModel): status: StepStatus = Field(default=StepStatus.ENABLED, description="Whether the step is enabled or disabled") -# Simple tool for quick testing of the double-call bug fix -@tool( - name="get_current_time", - description="Get the current date and time. 
Requires user approval.", - approval_mode="always_require", -) -def get_current_time() -> str: - """Get the current date and time. - - This is a simple tool for testing the approval flow. Call it multiple times - to verify the double-call bug fix works. - - Returns: - Current date and time string. - """ - return datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - @tool( name="generate_task_steps", description="Generate execution steps for a task", @@ -80,13 +55,7 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A return ChatAgent( name="human_in_the_loop_agent", instructions="""You are a helpful assistant that can perform any task by breaking it down into steps. - You can also tell the user the current time. - - FOR TIME REQUESTS: - When the user asks "what time is it?" or similar, call the `get_current_time` function. - This is useful for testing the approval flow multiple times quickly. - FOR TASK PLANNING: When asked to perform a task, you MUST call the `generate_task_steps` function with the proper number of steps per the request. @@ -107,10 +76,11 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A 9. "Calibrate systems" 10. "Final testing" - IMPORTANT: Both tools require user approval before execution. + IMPORTANT: When you call generate_task_steps, the user will be shown the steps and asked to approve. Do NOT output any text along with the function call - just call the function. - After the user approves and the function executes, provide a brief response with the result. + After the user approves and the function executes, THEN provide a brief acknowledgment like: + "The plan has been created with X steps selected." """, chat_client=chat_client, - tools=[get_current_time, generate_task_steps], + tools=[generate_task_steps], ) From ba82b672ed91012023d535022ecb3a3ca5721dcf Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 15:27:31 +0900 Subject: [PATCH 5/6] Address copilot feedback --- .../packages/ag-ui/agent_framework_ag_ui/_message_adapters.py | 4 ++-- python/packages/ag-ui/agent_framework_ag_ui/_run.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index 798569d2ca..0187de012e 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -58,8 +58,8 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: c for c in (msg.contents or []) if not (c.type == "function_call" and c.name == "confirm_changes") ] if filtered_contents: - # Create a new message without confirm_changes - msg = ChatMessage(role=msg.role, contents=filtered_contents) + # Update the existing message without confirm_changes, preserving metadata + msg.contents = filtered_contents sanitized.append(msg) # If no contents left after filtering, don't append anything diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index 6457ce3ba8..3c06880436 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -391,7 +391,7 @@ def _emit_tool_result( # This handles the case where a TextMessageStartEvent was emitted for tool-only # messages (Feature #4) but needs to be closed before starting a new message if flow.message_id: - logger.info(f"Closing text 
message (issue #3568 fix): message_id={flow.message_id}") + logger.debug("Closing text message (issue #3568 fix): message_id=%s", flow.message_id) events.append(TextMessageEndEvent(message_id=flow.message_id)) flow.message_id = None # Reset so next text content starts a new message From 47693369ae90afe82118069fb7354068358bf031 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Wed, 4 Feb 2026 07:44:31 +0900 Subject: [PATCH 6/6] PR feedback addressed --- .../_message_adapters.py | 8 +- .../ag-ui/agent_framework_ag_ui/_run.py | 98 ++++++++----------- .../ag-ui/tests/test_message_hygiene.py | 17 ++-- 3 files changed, 53 insertions(+), 70 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index 0187de012e..79a07edd02 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -58,9 +58,9 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: c for c in (msg.contents or []) if not (c.type == "function_call" and c.name == "confirm_changes") ] if filtered_contents: - # Update the existing message without confirm_changes, preserving metadata - msg.contents = filtered_contents - sanitized.append(msg) + # Create a new message without confirm_changes to avoid mutating the input + filtered_msg = ChatMessage(role=msg.role, contents=filtered_contents) + sanitized.append(filtered_msg) # If no contents left after filtering, don't append anything # Remove confirm_changes from tool_ids since we filtered it from the message @@ -119,6 +119,8 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: user_text = content.text # type: ignore[assignment] break + if not user_text: + continue try: parsed = json.loads(user_text) # type: ignore[arg-type] if "accepted" in parsed: diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index 3c06880436..1195d3b42f 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -516,7 +516,7 @@ def _is_confirm_changes_response(messages: list[Any]) -> bool: # Parse the content to check if it has the confirm_changes structure for content in last.contents: - if getattr(content, "type", None) == "text": + if getattr(content, "type", None) == "text" and content.text: try: result = json.loads(content.text) # confirm_changes results have 'accepted' and 'steps' keys @@ -536,31 +536,34 @@ def _handle_step_based_approval(messages: list[Any]) -> list[BaseEvent]: # Parse the approval content approval_text = "" for content in last.contents: - if getattr(content, "type", None) == "text": + if getattr(content, "type", None) == "text" and content.text: approval_text = content.text break - try: - result = json.loads(approval_text) - accepted = result.get("accepted", False) - steps = result.get("steps", []) - - if accepted: - # Generate acceptance message with step descriptions - enabled_steps = [s for s in steps if s.get("status") == "enabled"] - if enabled_steps: - message_parts = [f"Executing {len(enabled_steps)} approved steps:\n\n"] - for i, step in enumerate(enabled_steps, 1): - message_parts.append(f"{i}. {step.get('description', 'Step')}\n") - message_parts.append("\nAll steps completed successfully!") - message = "".join(message_parts) - else: - message = "Changes confirmed and applied successfully!" 
- else: - # Rejection message - message = "No problem! What would you like me to change about the plan?" - except json.JSONDecodeError: + if not approval_text: message = "Acknowledged." + else: + try: + result = json.loads(approval_text) + accepted = result.get("accepted", False) + steps = result.get("steps", []) + + if accepted: + # Generate acceptance message with step descriptions + enabled_steps = [s for s in steps if s.get("status") == "enabled"] + if enabled_steps: + message_parts = [f"Executing {len(enabled_steps)} approved steps:\n\n"] + for i, step in enumerate(enabled_steps, 1): + message_parts.append(f"{i}. {step.get('description', 'Step')}\n") + message_parts.append("\nAll steps completed successfully!") + message = "".join(message_parts) + else: + message = "Changes confirmed and applied successfully!" + else: + # Rejection message + message = "No problem! What would you like me to change about the plan?" + except json.JSONDecodeError: + message = "Acknowledged." message_id = generate_event_id() events.append(TextMessageStartEvent(message_id=message_id, role="assistant")) @@ -661,56 +664,33 @@ def _convert_approval_results_to_tool_messages(messages: list[Any]) -> None: Args: messages: List of ChatMessage objects to process """ - i = 0 - while i < len(messages): - msg = messages[i] - role_value = get_role_value(msg) + result: list[Any] = [] - if role_value != "user": - i += 1 + for msg in messages: + if get_role_value(msg) != "user": + result.append(msg) continue - # Check if this user message has function_result content - function_results: list[Content] = [] - other_contents: list[Any] = [] - - for content in msg.contents or []: - if getattr(content, "type", None) == "function_result": - function_results.append(content) - else: - other_contents.append(content) + function_results = [c for c in (msg.contents or []) if getattr(c, "type", None) == "function_result"] + other_contents = [c for c in (msg.contents or []) if getattr(c, "type", None) != "function_result"] if not function_results: - i += 1 + result.append(msg) continue - # We have function results in a user message - need to fix this logger.info( f"Converting {len(function_results)} function_result content(s) from user message to tool message(s)" ) - # Create tool messages for each function result - new_tool_messages = [] + # Tool messages first (right after the preceding assistant message per OpenAI requirements) for func_result in function_results: - tool_msg = ChatMessage( - role="tool", - contents=[func_result], - ) - new_tool_messages.append(tool_msg) + result.append(ChatMessage(role="tool", contents=[func_result])) + # Then user message with remaining content (if any) if other_contents: - # Keep the user message with remaining contents - msg.contents = other_contents - # Insert tool messages after this user message - for j, tool_msg in enumerate(new_tool_messages): - messages.insert(i + 1 + j, tool_msg) - i += 1 + len(new_tool_messages) - else: - # No other contents - replace user message with tool messages - messages.pop(i) - for j, tool_msg in enumerate(new_tool_messages): - messages.insert(i + j, tool_msg) - i += len(new_tool_messages) + result.append(ChatMessage(role=msg.role, contents=other_contents)) + + messages[:] = result def _build_messages_snapshot( @@ -1063,7 +1043,7 @@ async def run_agent_stream( # Close any open message if flow.message_id: - logger.info(f"End of run: closing text message message_id={flow.message_id}") + logger.debug(f"End of run: closing text message message_id={flow.message_id}") 
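+            # This end event pairs with the TextMessageStartEvent emitted
+            # earlier in the stream, keeping START/END events balanced for
+            # AG-UI clients (see TestTextMessageEventBalancing in test_run.py).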
yield TextMessageEndEvent(message_id=flow.message_id) # Emit MessagesSnapshotEvent if we have tool calls or results diff --git a/python/packages/ag-ui/tests/test_message_hygiene.py b/python/packages/ag-ui/tests/test_message_hygiene.py index b2f74e60a7..70567d00ab 100644 --- a/python/packages/ag-ui/tests/test_message_hygiene.py +++ b/python/packages/ag-ui/tests/test_message_hygiene.py @@ -128,20 +128,21 @@ def test_convert_approval_results_preserves_other_user_content() -> None: _convert_approval_results_to_tool_messages(messages) - # Should have 3 messages now: assistant, user (with text), tool (with result) + # Should have 3 messages now: assistant, tool (with result), user (with text) + # OpenAI requires tool messages immediately after the assistant message with the tool call assert len(messages) == 3 # First message unchanged assert messages[0].role.value == "assistant" - # Second message should be user with just text - assert messages[1].role.value == "user" - assert len(messages[1].contents) == 1 - assert messages[1].contents[0].type == "text" + # Second message should be tool with result (must come right after assistant per OpenAI requirements) + assert messages[1].role.value == "tool" + assert messages[1].contents[0].type == "function_result" - # Third message should be tool with result - assert messages[2].role.value == "tool" - assert messages[2].contents[0].type == "function_result" + # Third message should be user with just text + assert messages[2].role.value == "user" + assert len(messages[2].contents) == 1 + assert messages[2].contents[0].type == "text" def test_sanitize_tool_history_filters_confirm_changes_keeps_other_tools() -> None: