From 2da8c136a28c77d8960fd9f652599f8245967b66 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 11:38:19 +0900 Subject: [PATCH 1/6] AG-UI bug fixes --- .../_message_adapters.py | 16 +- .../ag-ui/agent_framework_ag_ui/_run.py | 115 ++++++++++- .../ag-ui/tests/test_message_hygiene.py | 135 ++++++++++++ python/packages/ag-ui/tests/test_run.py | 192 ++++++++++++++++++ 4 files changed, 442 insertions(+), 16 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index f8f1623a30..cf189b0a4d 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -67,7 +67,7 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: if approval_call_ids and pending_tool_call_ids: pending_tool_call_ids -= approval_call_ids logger.info( - f"FunctionApprovalResponseContent found for call_ids={sorted(approval_call_ids)} - " + f"function_approval_response content found for call_ids={sorted(approval_call_ids)} - " "framework will handle execution" ) @@ -150,6 +150,10 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: call_id = str(content.call_id) if call_id in pending_tool_call_ids: keep = True + # Remove the call_id from pending since we now have its result. + # This prevents duplicate synthetic "skipped" results from being + # injected when a user message arrives later. + pending_tool_call_ids.discard(call_id) if call_id == pending_confirm_changes_id: pending_confirm_changes_id = None break @@ -338,7 +342,7 @@ def _filter_modified_args( result: list[ChatMessage] = [] for msg in messages: # Handle standard tool result messages early (role="tool") to preserve provider invariants - # This path maps AG‑UI tool messages to FunctionResultContent with the correct tool_call_id + # This path maps AG‑UI tool messages to function_result content with the correct tool_call_id role_str = normalize_agui_role(msg.get("role", "user")) if role_str == "tool": # Prefer explicit tool_call_id fields; fall back to backend fields only if necessary @@ -371,7 +375,7 @@ def _filter_modified_args( if is_approval: # Look for the matching function call in previous messages to create - # a proper FunctionApprovalResponseContent. This enables the agent framework + # proper function_approval_response content. This enables the agent framework # to execute the approved tool (fix for GitHub issue #3034). 
accepted = parsed.get("accepted", False) if parsed is not None else False approval_payload_text = result_content if isinstance(result_content, str) else json.dumps(parsed) @@ -465,7 +469,7 @@ def _filter_modified_args( # No modified arguments - use the original function call func_call_for_approval = matching_func_call - # Create FunctionApprovalResponseContent for the agent framework + # Create function_approval_response content for the agent framework approval_response = Content.from_function_approval_response( approved=accepted, id=str(approval_call_id), @@ -489,7 +493,7 @@ def _filter_modified_args( result.append(chat_msg) continue - # Cast result_content to acceptable type for FunctionResultContent + # Cast result_content to acceptable type for function_result content func_result: str | dict[str, Any] | list[Any] if isinstance(result_content, str): func_result = result_content @@ -566,7 +570,7 @@ def _filter_modified_args( # Check if this message contains function approvals if "function_approvals" in msg and msg["function_approvals"]: - # Convert function approvals to FunctionApprovalResponseContent + # Convert function approvals to function_approval_response content approval_contents: list[Any] = [] for approval in msg["function_approvals"]: # Create FunctionCallContent with the modified arguments diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index d1229620a7..e9b7554e17 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -45,6 +45,7 @@ convert_agui_tools_to_agent_framework, generate_event_id, get_conversation_id_from_update, + get_role_value, make_json_safe, ) @@ -344,7 +345,7 @@ def _emit_tool_result( flow: FlowState, predictive_handler: PredictiveStateHandler | None = None, ) -> list[BaseEvent]: - """Emit ToolCallResult events for FunctionResultContent.""" + """Emit ToolCallResult events for function_result content.""" events: list[BaseEvent] = [] # Cannot emit tool result without a call_id to associate it with @@ -385,6 +386,12 @@ def _emit_tool_result( # After tool result, any subsequent text should start a new message flow.tool_call_id = None flow.tool_call_name = None + + # Close any open text message before resetting message_id (issue #3568) + # This handles the case where a TextMessageStartEvent was emitted for tool-only + # messages (Feature #4) but needs to be closed before starting a new message + if flow.message_id: + events.append(TextMessageEndEvent(message_id=flow.message_id)) flow.message_id = None # Reset so next text content starts a new message return events @@ -558,8 +565,8 @@ async def _resolve_approval_responses( ) -> None: """Execute approved function calls and replace approval content with results. - This modifies the messages list in place, replacing FunctionApprovalResponseContent - with FunctionResultContent containing the actual tool execution result. + This modifies the messages list in place, replacing function_approval_response + content with function_result content containing the actual tool execution result. Args: messages: List of messages (will be modified in place) @@ -622,6 +629,76 @@ async def _resolve_approval_responses( _replace_approval_contents_with_results(messages, fcc_todo, normalized_results) # type: ignore + # Post-process: Convert user messages with function_result content to proper tool messages. 
+ # After _replace_approval_contents_with_results, approved tool calls have their results + # placed in user messages. OpenAI requires tool results to be in role="tool" messages. + # This transformation ensures the message history is valid for the LLM provider. + _convert_approval_results_to_tool_messages(messages) + + +def _convert_approval_results_to_tool_messages(messages: list[Any]) -> None: + """Convert function_result content in user messages to proper tool messages. + + After approval processing, tool results end up in user messages. OpenAI and other + providers require tool results to be in role="tool" messages. This function + extracts function_result content from user messages and creates proper tool messages. + + This modifies the messages list in place. + + Args: + messages: List of ChatMessage objects to process + """ + i = 0 + while i < len(messages): + msg = messages[i] + role_value = get_role_value(msg) + + if role_value != "user": + i += 1 + continue + + # Check if this user message has function_result content + function_results: list[Content] = [] + other_contents: list[Any] = [] + + for content in msg.contents or []: + if getattr(content, "type", None) == "function_result": + function_results.append(content) + else: + other_contents.append(content) + + if not function_results: + i += 1 + continue + + # We have function results in a user message - need to fix this + logger.info( + f"Converting {len(function_results)} function_result content(s) from user message to tool message(s)" + ) + + # Create tool messages for each function result + new_tool_messages = [] + for func_result in function_results: + tool_msg = ChatMessage( + role="tool", + contents=[func_result], + ) + new_tool_messages.append(tool_msg) + + if other_contents: + # Keep the user message with remaining contents + msg.contents = other_contents + # Insert tool messages after this user message + for j, tool_msg in enumerate(new_tool_messages): + messages.insert(i + 1 + j, tool_msg) + i += 1 + len(new_tool_messages) + else: + # No other contents - replace user message with tool messages + messages.pop(i) + for j, tool_msg in enumerate(new_tool_messages): + messages.insert(i + j, tool_msg) + i += len(new_tool_messages) + def _build_messages_snapshot( flow: FlowState, @@ -630,25 +707,29 @@ def _build_messages_snapshot( """Build MessagesSnapshotEvent from current flow state.""" all_messages = list(snapshot_messages) - # Add assistant message with tool calls + # Add assistant message with tool calls only (no content) if flow.pending_tool_calls: tool_call_message = { "id": flow.message_id or generate_event_id(), "role": "assistant", "tool_calls": flow.pending_tool_calls.copy(), } - if flow.accumulated_text: - tool_call_message["content"] = flow.accumulated_text all_messages.append(tool_call_message) # Add tool results all_messages.extend(flow.tool_results) - # Add text-only assistant message if no tool calls - if flow.accumulated_text and not flow.pending_tool_calls: + # Add text-only assistant message if there is accumulated text + # This is a separate message from the tool calls message to maintain + # the expected AG-UI protocol format (see issue #3619) + if flow.accumulated_text: + # Use a new ID for the content message if we had tool calls (separate message) + content_message_id = ( + generate_event_id() if flow.pending_tool_calls else (flow.message_id or generate_event_id()) + ) all_messages.append( { - "id": flow.message_id or generate_event_id(), + "id": content_message_id, "role": "assistant", "content": 
flow.accumulated_text, } @@ -922,6 +1003,20 @@ async def run_agent_stream( tool_call_id, ) + # Parse function arguments - skip confirm_changes if we can't parse + # (we can't ask user to confirm something we can't properly display) + try: + function_arguments = json.loads(tool_call.get("function", {}).get("arguments", "{}")) + except json.JSONDecodeError: + logger.warning( + "Failed to decode JSON arguments for confirm_changes tool '%s' " + "(tool_call_id=%s). Skipping confirmation flow - cannot display " + "malformed arguments to user for approval.", + tool_name, + tool_call_id, + ) + continue # Skip to next tool call without emitting confirm_changes + # Emit confirm_changes tool call confirm_id = generate_event_id() yield ToolCallStartEvent( @@ -932,7 +1027,7 @@ async def run_agent_stream( confirm_args = { "function_name": tool_name, "function_call_id": tool_call_id, - "function_arguments": json.loads(tool_call.get("function", {}).get("arguments", "{}")), + "function_arguments": function_arguments, "steps": [{"description": f"Execute {tool_name}", "status": "enabled"}], } yield ToolCallArgsEvent(tool_call_id=confirm_id, delta=json.dumps(confirm_args)) diff --git a/python/packages/ag-ui/tests/test_message_hygiene.py b/python/packages/ag-ui/tests/test_message_hygiene.py index ecc01de3cb..eb55965ab9 100644 --- a/python/packages/ag-ui/tests/test_message_hygiene.py +++ b/python/packages/ag-ui/tests/test_message_hygiene.py @@ -48,3 +48,138 @@ def test_deduplicate_messages_prefers_non_empty_tool_results() -> None: deduped = _deduplicate_messages(messages) assert len(deduped) == 1 assert deduped[0].contents[0].result == "result data" + + +def test_convert_approval_results_to_tool_messages() -> None: + """Test that function_result content in user messages gets converted to tool messages. + + This is a regression test for the MCP tool double-call bug where approved tool + results ended up in user messages instead of tool messages, causing OpenAI to + reject the request with 'tool_call_ids did not have response messages'. + """ + from agent_framework_ag_ui._run import _convert_approval_results_to_tool_messages + + # Simulate what happens after _resolve_approval_responses: + # A user message contains function_result content (the executed tool result) + messages = [ + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call(call_id="call_123", name="my_mcp_tool", arguments="{}"), + ], + ), + ChatMessage( + role="user", + contents=[ + Content.from_function_result(call_id="call_123", result="tool execution result"), + ], + ), + ] + + _convert_approval_results_to_tool_messages(messages) + + # After conversion, the function result should be in a tool message, not user message + assert len(messages) == 2 + + # First message unchanged + assert messages[0].role.value == "assistant" + + # Second message should now be role="tool" + assert messages[1].role.value == "tool" + assert messages[1].contents[0].type == "function_result" + assert messages[1].contents[0].call_id == "call_123" + + +def test_convert_approval_results_preserves_other_user_content() -> None: + """Test that user messages with mixed content are handled correctly. + + If a user message has both function_result content and other content (like text), + the function_result content should be extracted to a tool message while the + remaining content stays in the user message. 
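+
+    A minimal sketch of the rewrite this exercises (message contents only):
+
+        user[text, function_result]  ->  user[text], tool[function_result]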
+ """ + from agent_framework_ag_ui._run import _convert_approval_results_to_tool_messages + + messages = [ + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call(call_id="call_123", name="my_tool", arguments="{}"), + ], + ), + ChatMessage( + role="user", + contents=[ + Content.from_text(text="User also said something"), + Content.from_function_result(call_id="call_123", result="tool result"), + ], + ), + ] + + _convert_approval_results_to_tool_messages(messages) + + # Should have 3 messages now: assistant, user (with text), tool (with result) + assert len(messages) == 3 + + # First message unchanged + assert messages[0].role.value == "assistant" + + # Second message should be user with just text + assert messages[1].role.value == "user" + assert len(messages[1].contents) == 1 + assert messages[1].contents[0].type == "text" + + # Third message should be tool with result + assert messages[2].role.value == "tool" + assert messages[2].contents[0].type == "function_result" + + +def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: + """Test that confirm_changes is properly handled across multiple requests. + + This is a regression test for the MCP tool double-call bug. When a second + MCP tool call happens after the first one completed, the message history + should have proper synthetic results for all confirm_changes calls. + """ + # Simulate history from first request (already completed) + # Note: confirm_changes synthetic result might be missing from frontend's snapshot + messages = [ + # First request - user asks something + ChatMessage( + role="user", + contents=[Content.from_text(text="What time is it?")], + ), + # First request - assistant calls MCP tool + confirm_changes + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call(call_id="call_1", name="get_datetime", arguments="{}"), + Content.from_function_call(call_id="call_c1", name="confirm_changes", arguments="{}"), + ], + ), + # First request - tool result for the actual MCP tool + ChatMessage( + role="tool", + contents=[Content.from_function_result(call_id="call_1", result="2024-01-01 12:00:00")], + ), + # Note: NO tool result for call_c1 (confirm_changes) - this is the bug scenario! + # The synthetic was injected in request 1 but not persisted in snapshot + # Second request - user asks something else + ChatMessage( + role="user", + contents=[Content.from_text(text="What's the date?")], + ), + ] + + sanitized = _sanitize_tool_history(messages) + + # After sanitization, call_c1 should have a synthetic result + tool_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" + ] + + # Should have 2 tool messages: one for call_1 (real) and one for call_c1 (synthetic) + assert len(tool_messages) == 2 + + tool_call_ids = {str(msg.contents[0].call_id) for msg in tool_messages} + assert "call_1" in tool_call_ids + assert "call_c1" in tool_call_ids diff --git a/python/packages/ag-ui/tests/test_run.py b/python/packages/ag-ui/tests/test_run.py index a415000692..5cfcfe7619 100644 --- a/python/packages/ag-ui/tests/test_run.py +++ b/python/packages/ag-ui/tests/test_run.py @@ -353,6 +353,58 @@ def test_emit_tool_call_generates_id(): assert flow.tool_call_id is not None # ID should be generated +def test_emit_tool_result_closes_open_message(): + """Test _emit_tool_result emits TextMessageEndEvent for open text message. 
+ + This is a regression test for issue #3568 where TEXT_MESSAGE_END was not + emitted when using MCP tools because the message_id was reset without + closing the message first. + """ + from ag_ui.core import TextMessageEndEvent + + from agent_framework_ag_ui._run import FlowState, _emit_tool_result + + flow = FlowState() + # Simulate an open text message (e.g., from Feature #4 tool-only detection) + flow.message_id = "open-msg-123" + flow.tool_call_id = "call_456" + + content = Content.from_function_result(call_id="call_456", result="tool result") + + events = _emit_tool_result(content, flow, predictive_handler=None) + + # Should have: ToolCallEndEvent, ToolCallResultEvent, TextMessageEndEvent + assert len(events) == 3 + + # Verify TextMessageEndEvent is emitted for the open message + text_end_events = [e for e in events if isinstance(e, TextMessageEndEvent)] + assert len(text_end_events) == 1 + assert text_end_events[0].message_id == "open-msg-123" + + # Verify message_id is reset after + assert flow.message_id is None + + +def test_emit_tool_result_no_open_message(): + """Test _emit_tool_result works when there's no open text message.""" + from ag_ui.core import TextMessageEndEvent + + from agent_framework_ag_ui._run import FlowState, _emit_tool_result + + flow = FlowState() + # No open message + flow.message_id = None + flow.tool_call_id = "call_456" + + content = Content.from_function_result(call_id="call_456", result="tool result") + + events = _emit_tool_result(content, flow, predictive_handler=None) + + # Should have: ToolCallEndEvent, ToolCallResultEvent (no TextMessageEndEvent) + text_end_events = [e for e in events if isinstance(e, TextMessageEndEvent)] + assert len(text_end_events) == 0 + + def test_extract_approved_state_updates_no_handler(): """Test _extract_approved_state_updates returns empty with no handler.""" from agent_framework_ag_ui._run import _extract_approved_state_updates @@ -371,3 +423,143 @@ def test_extract_approved_state_updates_no_approval(): messages = [ChatMessage(role="user", contents=[Content.from_text("Hello")])] result = _extract_approved_state_updates(messages, handler) assert result == {} + + +class TestBuildMessagesSnapshot: + """Tests for _build_messages_snapshot function.""" + + def test_tool_calls_and_text_are_separate_messages(self): + """Test that tool calls and text content are emitted as separate messages. + + This is a regression test for issue #3619 where tool calls and content + were incorrectly merged into a single assistant message. + """ + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.message_id = "msg-123" + flow.pending_tool_calls = [ + {"id": "call_1", "function": {"name": "get_weather", "arguments": '{"city": "NYC"}'}}, + ] + flow.accumulated_text = "Here is the weather information." 
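+        # One finished tool result, keyed by the AG-UI camelCase "toolCallId" field.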
+ flow.tool_results = [{"id": "result-1", "role": "tool", "content": '{"temp": 72}', "toolCallId": "call_1"}] + + result = _build_messages_snapshot(flow, []) + + # Should have 3 messages: tool call msg, tool result, text content msg + assert len(result.messages) == 3 + + # First message: assistant with tool calls only (no content) + assistant_tool_msg = result.messages[0] + assert assistant_tool_msg.role == "assistant" + assert assistant_tool_msg.tool_calls is not None + assert len(assistant_tool_msg.tool_calls) == 1 + assert assistant_tool_msg.content is None + + # Second message: tool result + tool_result_msg = result.messages[1] + assert tool_result_msg.role == "tool" + + # Third message: assistant with content only (no tool calls) + assistant_text_msg = result.messages[2] + assert assistant_text_msg.role == "assistant" + assert assistant_text_msg.content == "Here is the weather information." + assert assistant_text_msg.tool_calls is None + + # The text message should have a different ID than the tool call message + assert assistant_text_msg.id != assistant_tool_msg.id + + def test_only_tool_calls_no_text(self): + """Test snapshot with only tool calls and no accumulated text.""" + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.message_id = "msg-123" + flow.pending_tool_calls = [ + {"id": "call_1", "function": {"name": "get_weather", "arguments": "{}"}}, + ] + flow.accumulated_text = "" + flow.tool_results = [] + + result = _build_messages_snapshot(flow, []) + + # Should have 1 message: tool call msg only + assert len(result.messages) == 1 + assert result.messages[0].role == "assistant" + assert result.messages[0].tool_calls is not None + assert result.messages[0].content is None + + def test_only_text_no_tool_calls(self): + """Test snapshot with only text and no tool calls.""" + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.message_id = "msg-123" + flow.pending_tool_calls = [] + flow.accumulated_text = "Hello world" + flow.tool_results = [] + + result = _build_messages_snapshot(flow, []) + + # Should have 1 message: text content msg only + assert len(result.messages) == 1 + assert result.messages[0].role == "assistant" + assert result.messages[0].content == "Hello world" + assert result.messages[0].tool_calls is None + # Should use the existing message_id + assert result.messages[0].id == "msg-123" + + def test_preserves_snapshot_messages(self): + """Test that existing snapshot messages are preserved.""" + from agent_framework_ag_ui._run import FlowState, _build_messages_snapshot + + flow = FlowState() + flow.pending_tool_calls = [] + flow.accumulated_text = "" + + existing_messages = [ + {"id": "user-1", "role": "user", "content": "Hello"}, + {"id": "assist-1", "role": "assistant", "content": "Hi there"}, + ] + + result = _build_messages_snapshot(flow, existing_messages) + + assert len(result.messages) == 2 + assert result.messages[0].id == "user-1" + assert result.messages[1].id == "assist-1" + + +def test_malformed_json_in_confirm_args_skips_confirmation(): + """Test that malformed JSON in tool arguments skips confirm_changes flow. + + This is a regression test to ensure that when tool arguments contain malformed + JSON, the code skips the confirmation flow entirely rather than crashing or + showing incomplete data to the user. 
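+
+    Note: this mirrors the try/except json.JSONDecodeError guard used in
+    run_agent_stream rather than driving the full streaming loop.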
+ """ + import json + + # Simulate the parsing logic - malformed JSON should trigger skip + malformed_arguments = "{ invalid json }" + tool_call = {"function": {"name": "write_doc", "arguments": malformed_arguments}} + + # This is what the code should do - detect parsing failure and skip + should_skip_confirmation = False + try: + json.loads(tool_call.get("function", {}).get("arguments", "{}")) + except json.JSONDecodeError: + should_skip_confirmation = True + + # Should skip confirmation when JSON is malformed + assert should_skip_confirmation is True + + # Valid JSON should proceed with confirmation + valid_arguments = '{"content": "hello"}' + tool_call_valid = {"function": {"name": "write_doc", "arguments": valid_arguments}} + should_skip_confirmation = False + try: + function_arguments = json.loads(tool_call_valid.get("function", {}).get("arguments", "{}")) + except json.JSONDecodeError: + should_skip_confirmation = True + + assert should_skip_confirmation is False + assert function_arguments == {"content": "hello"} From 76c70df5eeb665a27fd81203d6416b638764393c Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 12:04:01 +0900 Subject: [PATCH 2/6] Fixes --- .../_message_adapters.py | 12 +- .../ag-ui/agent_framework_ag_ui/_run.py | 28 ++- .../agents/human_in_the_loop_agent.py | 40 +++- .../test_mcp_double_call_fix.py | 184 ++++++++++++++++++ .../ag-ui/tests/test_message_adapters.py | 12 +- 5 files changed, 263 insertions(+), 13 deletions(-) create mode 100644 python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index cf189b0a4d..f798fadc86 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -452,11 +452,15 @@ def _filter_modified_args( merged_args["steps"] = merged_steps state_args = merged_args - # Keep the original tool call and AG-UI snapshot in sync with approved args. - updated_args = ( - json.dumps(merged_args) if isinstance(matching_func_call.arguments, str) else merged_args + # Update the ChatMessage tool call with only enabled steps (for LLM context). + # The LLM should only see the steps that were actually approved/executed. + updated_args_for_llm = ( + json.dumps(filtered_args) if isinstance(matching_func_call.arguments, str) else filtered_args ) - matching_func_call.arguments = updated_args + matching_func_call.arguments = updated_args_for_llm + + # Update raw messages with all steps + status (for MESSAGES_SNAPSHOT display). + # This allows the UI to show which steps were enabled/disabled. 
_update_tool_call_arguments(messages, str(approval_call_id), merged_args) # Create a new FunctionCallContent with the modified arguments func_call_for_approval = Content.from_function_call( diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index e9b7554e17..943869cc48 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -461,9 +461,21 @@ def _emit_approval_request( "function_arguments": make_json_safe(func_call.parse_arguments()) or {}, "steps": [{"description": f"Execute {func_name}", "status": "enabled"}], } - events.append(ToolCallArgsEvent(tool_call_id=confirm_id, delta=json.dumps(args))) + args_json = json.dumps(args) + events.append(ToolCallArgsEvent(tool_call_id=confirm_id, delta=args_json)) events.append(ToolCallEndEvent(tool_call_id=confirm_id)) + # Track confirm_changes in pending_tool_calls for MessagesSnapshotEvent + # The frontend needs to see this in the snapshot to render the confirmation dialog + confirm_entry = { + "id": confirm_id, + "type": "function", + "function": {"name": "confirm_changes", "arguments": args_json}, + } + flow.pending_tool_calls.append(confirm_entry) + flow.tool_calls_by_id[confirm_id] = confirm_entry + flow.tool_calls_ended.add(confirm_id) # Mark as ended since we emit End event + flow.waiting_for_approval = True return events @@ -1030,8 +1042,20 @@ async def run_agent_stream( "function_arguments": function_arguments, "steps": [{"description": f"Execute {tool_name}", "status": "enabled"}], } - yield ToolCallArgsEvent(tool_call_id=confirm_id, delta=json.dumps(confirm_args)) + confirm_args_json = json.dumps(confirm_args) + yield ToolCallArgsEvent(tool_call_id=confirm_id, delta=confirm_args_json) yield ToolCallEndEvent(tool_call_id=confirm_id) + + # Track confirm_changes in pending_tool_calls for MessagesSnapshotEvent + # The frontend needs to see this in the snapshot to render the confirmation dialog + confirm_entry = { + "id": confirm_id, + "type": "function", + "function": {"name": "confirm_changes", "arguments": confirm_args_json}, + } + flow.pending_tool_calls.append(confirm_entry) + flow.tool_calls_by_id[confirm_id] = confirm_entry + flow.tool_calls_ended.add(confirm_id) # Mark as ended since we emit End event flow.waiting_for_approval = True # Close any open message diff --git a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py index 368c4e47ed..fde3fb56a7 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py +++ b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py @@ -1,7 +1,14 @@ # Copyright (c) Microsoft. All rights reserved. -"""Human-in-the-loop agent demonstrating step customization (Feature 5).""" +"""Human-in-the-loop agent demonstrating step customization (Feature 5). +This agent also serves as a test case for the MCP tool double-call bug fix. +To test: call the agent twice with tasks that require approval (e.g., "Build a robot" +then "Build a house"). Before the fix, the second call would fail with: +"An assistant message with 'tool_calls' must be followed by tool messages..." 
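+
+The underlying fixes live in agent_framework_ag_ui._message_adapters
+(_sanitize_tool_history) and agent_framework_ag_ui._run
+(_convert_approval_results_to_tool_messages).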
+""" + +from datetime import datetime from enum import Enum from typing import Any @@ -23,6 +30,24 @@ class TaskStep(BaseModel): status: StepStatus = Field(default=StepStatus.ENABLED, description="Whether the step is enabled or disabled") +# Simple tool for quick testing of the double-call bug fix +@tool( + name="get_current_time", + description="Get the current date and time. Requires user approval.", + approval_mode="always_require", +) +def get_current_time() -> str: + """Get the current date and time. + + This is a simple tool for testing the approval flow. Call it multiple times + to verify the double-call bug fix works. + + Returns: + Current date and time string. + """ + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + @tool( name="generate_task_steps", description="Generate execution steps for a task", @@ -55,7 +80,13 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A return ChatAgent( name="human_in_the_loop_agent", instructions="""You are a helpful assistant that can perform any task by breaking it down into steps. + You can also tell the user the current time. + + FOR TIME REQUESTS: + When the user asks "what time is it?" or similar, call the `get_current_time` function. + This is useful for testing the approval flow multiple times quickly. + FOR TASK PLANNING: When asked to perform a task, you MUST call the `generate_task_steps` function with the proper number of steps per the request. @@ -76,11 +107,10 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A 9. "Calibrate systems" 10. "Final testing" - IMPORTANT: When you call generate_task_steps, the user will be shown the steps and asked to approve. + IMPORTANT: Both tools require user approval before execution. Do NOT output any text along with the function call - just call the function. - After the user approves and the function executes, THEN provide a brief acknowledgment like: - "The plan has been created with X steps selected." + After the user approves and the function executes, provide a brief response with the result. """, chat_client=chat_client, - tools=[generate_task_steps], + tools=[get_current_time, generate_task_steps], ) diff --git a/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py b/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py new file mode 100644 index 0000000000..1e3373ce04 --- /dev/null +++ b/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py @@ -0,0 +1,184 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Test sample for validating MCP tool double-call bug fix. + +This sample demonstrates the fix for the bug where calling an MCP tool (or any tool +with `approval_mode="always_require"`) twice would fail with: + + "An assistant message with 'tool_calls' must be followed by tool messages + responding to each 'tool_call_id'" + +The bug was caused by: +1. Tool results from approved tools ending up in `role="user"` messages instead of `role="tool"` +2. Tool call IDs not being removed from pending tracking after seeing their results + +To test this sample: +1. Set environment variables: + - AZURE_OPENAI_ENDPOINT + - AZURE_OPENAI_CHAT_DEPLOYMENT_NAME + +2. Run the server: + python -m getting_started.test_mcp_double_call_fix + +3. Use the AG-UI frontend or curl to test: + - First request: "What time is it?" -> Should work, shows approval dialog + - Approve the request + - Second request: "What's the date?" -> Before fix: ERROR, After fix: Works! 
+""" + +import logging +import os +from datetime import datetime + +from agent_framework import ChatAgent, tool +from agent_framework.ag_ui import AgentFrameworkAgent, add_agent_framework_fastapi_endpoint +from agent_framework.azure import AzureOpenAIChatClient +from dotenv import load_dotenv +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +load_dotenv() + +# Enable detailed logging to see the fix in action +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + +# Also enable AG-UI specific logging +logging.getLogger("agent_framework_ag_ui").setLevel(logging.INFO) + + +# Tool with approval_mode="always_require" - simulates MCP tool behavior +@tool( + name="get_datetime", + description="Get the current date and time. Requires user approval before execution.", + approval_mode="always_require", +) +def get_datetime() -> str: + """Get the current date and time. + + This tool requires user approval before execution, similar to MCP tools. + + Returns: + The current date and time as a formatted string. + """ + now = datetime.now() + result = now.strftime("%Y-%m-%d %H:%M:%S") + logger.info(f"get_datetime tool executed, returning: {result}") + return result + + +# Another approval-required tool to test multiple different tools +@tool( + name="get_system_info", + description="Get system information. Requires user approval before execution.", + approval_mode="always_require", +) +def get_system_info() -> str: + """Get basic system information. + + This tool also requires user approval, to test calling different approval tools. + + Returns: + Basic system information. + """ + import platform + + info = f"Python {platform.python_version()} on {platform.system()} {platform.release()}" + logger.info(f"get_system_info tool executed, returning: {info}") + return info + + +def create_test_agent() -> ChatAgent: + """Create the test agent with approval-required tools.""" + endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") + deployment_name = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME") + + if not endpoint: + raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is required") + if not deployment_name: + raise ValueError("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME environment variable is required") + + return ChatAgent( + name="MCPTestAgent", + instructions="""You are a helpful assistant that can provide date/time and system information. + +When the user asks about time, date, or "what time is it", use the get_datetime tool. +When the user asks about system info, use the get_system_info tool. + +Important: These tools require user approval before execution. The user will see +an approval dialog and can choose to approve or reject the tool call. + +After a tool is approved and executed, provide a brief, friendly response with the result. 
+""", + chat_client=AzureOpenAIChatClient( + endpoint=endpoint, + deployment_name=deployment_name, + ), + tools=[get_datetime, get_system_info], + ) + + +# Create FastAPI app +app = FastAPI( + title="MCP Double-Call Bug Test", + description="Test server for validating the MCP tool double-call bug fix", +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Create the agent +agent = create_test_agent() + +# Register the AG-UI endpoint with require_confirmation=True (like the bug report) +add_agent_framework_fastapi_endpoint( + app=app, + agent=AgentFrameworkAgent( + agent=agent, + name="MCPTestAgent", + description="Test agent for MCP double-call bug fix validation", + require_confirmation=True, + ), + path="/", +) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "healthy", "message": "MCP double-call test server is running"} + + +def main(): + """Run the test server.""" + import uvicorn + + port = int(os.getenv("PORT", "8889")) + host = os.getenv("HOST", "127.0.0.1") + + print("\n" + "=" * 70) + print("MCP Double-Call Bug Fix Test Server") + print("=" * 70) + print(f"\nServer starting on http://{host}:{port}") + print("\nTo test the fix:") + print("1. Send: 'What time is it?' -> Approve -> Should work") + print("2. Send: 'What time is it?' again -> Approve -> Should also work!") + print(" (Before the fix, this second call would fail)") + print("\nEndpoints:") + print(f" - AG-UI: POST http://{host}:{port}/") + print(f" - Health: GET http://{host}:{port}/health") + print("=" * 70 + "\n") + + uvicorn.run(app, host=host, port=port, log_level="info") + + +if __name__ == "__main__": + main() diff --git a/python/packages/ag-ui/tests/test_message_adapters.py b/python/packages/ag-ui/tests/test_message_adapters.py index 4f6c3f1d42..15bb817be8 100644 --- a/python/packages/ag-ui/tests/test_message_adapters.py +++ b/python/packages/ag-ui/tests/test_message_adapters.py @@ -98,7 +98,14 @@ def test_agui_tool_result_to_agent_framework(): def test_agui_tool_approval_updates_tool_call_arguments(): - """Tool approval updates matching tool call arguments for snapshots and agent context.""" + """Tool approval updates matching tool call arguments for snapshots and agent context. + + The LLM context (ChatMessage) should contain only enabled steps, so the LLM + generates responses based on what was actually approved/executed. + + The raw messages (for MESSAGES_SNAPSHOT) should contain all steps with status, + so the UI can show which steps were enabled/disabled. 
+ """ messages_input = [ { "role": "assistant", @@ -142,13 +149,14 @@ def test_agui_tool_approval_updates_tool_call_arguments(): assert len(messages) == 2 assistant_msg = messages[0] func_call = next(content for content in assistant_msg.contents if content.type == "function_call") + # LLM context should only have enabled steps (what was actually approved) assert func_call.arguments == { "steps": [ {"description": "Boil water", "status": "enabled"}, - {"description": "Brew coffee", "status": "disabled"}, {"description": "Serve coffee", "status": "enabled"}, ] } + # Raw messages (for MESSAGES_SNAPSHOT) should have all steps with status assert messages_input[0]["tool_calls"][0]["function"]["arguments"] == { "steps": [ {"description": "Boil water", "status": "enabled"}, From b8a4d71bf869fb8b454f7d578ef15178cb02b65f Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 14:42:30 +0900 Subject: [PATCH 3/6] Fixes --- .../_message_adapters.py | 31 ++- .../ag-ui/agent_framework_ag_ui/_run.py | 4 + .../test_mcp_double_call_fix.py | 184 ------------------ .../ag-ui/tests/test_message_hygiene.py | 128 +++++++++--- python/packages/ag-ui/tests/test_run.py | 141 +++++++++++++- 5 files changed, 271 insertions(+), 217 deletions(-) delete mode 100644 python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index f798fadc86..798569d2ca 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -45,7 +45,32 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: confirm_changes_call = content break - sanitized.append(msg) + # Filter out confirm_changes from assistant messages before sending to LLM. + # confirm_changes is a synthetic tool for the approval UI flow - the LLM shouldn't + # see it because it may contain stale function_arguments that confuse the model + # (e.g., showing 5 steps when only 2 were approved). + # When we filter out confirm_changes, we also remove it from tool_ids and don't + # set pending_confirm_changes_id, so no synthetic result is injected for it. + # This is required because OpenAI validates that every tool result has a matching + # tool call in the previous assistant message. + if confirm_changes_call: + filtered_contents = [ + c for c in (msg.contents or []) if not (c.type == "function_call" and c.name == "confirm_changes") + ] + if filtered_contents: + # Create a new message without confirm_changes + msg = ChatMessage(role=msg.role, contents=filtered_contents) + sanitized.append(msg) + # If no contents left after filtering, don't append anything + + # Remove confirm_changes from tool_ids since we filtered it from the message + if confirm_changes_call.call_id: + tool_ids.discard(str(confirm_changes_call.call_id)) + # Don't set pending_confirm_changes_id - we don't want a synthetic result + confirm_changes_call = None + else: + sanitized.append(msg) + pending_tool_call_ids = tool_ids if tool_ids else None pending_confirm_changes_id = ( str(confirm_changes_call.call_id) if confirm_changes_call and confirm_changes_call.call_id else None @@ -455,7 +480,9 @@ def _filter_modified_args( # Update the ChatMessage tool call with only enabled steps (for LLM context). # The LLM should only see the steps that were actually approved/executed. 
updated_args_for_llm = ( - json.dumps(filtered_args) if isinstance(matching_func_call.arguments, str) else filtered_args + json.dumps(filtered_args) + if isinstance(matching_func_call.arguments, str) + else filtered_args ) matching_func_call.arguments = updated_args_for_llm diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index 943869cc48..6457ce3ba8 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -391,6 +391,7 @@ def _emit_tool_result( # This handles the case where a TextMessageStartEvent was emitted for tool-only # messages (Feature #4) but needs to be closed before starting a new message if flow.message_id: + logger.info(f"Closing text message (issue #3568 fix): message_id={flow.message_id}") events.append(TextMessageEndEvent(message_id=flow.message_id)) flow.message_id = None # Reset so next text content starts a new message @@ -920,6 +921,8 @@ async def run_agent_stream( # Emit events for each content item for content in update.contents: + content_type = getattr(content, "type", None) + logger.debug(f"Processing content type={content_type}, message_id={flow.message_id}") for event in _emit_content( content, flow, @@ -1060,6 +1063,7 @@ async def run_agent_stream( # Close any open message if flow.message_id: + logger.info(f"End of run: closing text message message_id={flow.message_id}") yield TextMessageEndEvent(message_id=flow.message_id) # Emit MessagesSnapshotEvent if we have tool calls or results diff --git a/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py b/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py deleted file mode 100644 index 1e3373ce04..0000000000 --- a/python/packages/ag-ui/getting_started/test_mcp_double_call_fix.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -"""Test sample for validating MCP tool double-call bug fix. - -This sample demonstrates the fix for the bug where calling an MCP tool (or any tool -with `approval_mode="always_require"`) twice would fail with: - - "An assistant message with 'tool_calls' must be followed by tool messages - responding to each 'tool_call_id'" - -The bug was caused by: -1. Tool results from approved tools ending up in `role="user"` messages instead of `role="tool"` -2. Tool call IDs not being removed from pending tracking after seeing their results - -To test this sample: -1. Set environment variables: - - AZURE_OPENAI_ENDPOINT - - AZURE_OPENAI_CHAT_DEPLOYMENT_NAME - -2. Run the server: - python -m getting_started.test_mcp_double_call_fix - -3. Use the AG-UI frontend or curl to test: - - First request: "What time is it?" -> Should work, shows approval dialog - - Approve the request - - Second request: "What's the date?" -> Before fix: ERROR, After fix: Works! 
-""" - -import logging -import os -from datetime import datetime - -from agent_framework import ChatAgent, tool -from agent_framework.ag_ui import AgentFrameworkAgent, add_agent_framework_fastapi_endpoint -from agent_framework.azure import AzureOpenAIChatClient -from dotenv import load_dotenv -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware - -load_dotenv() - -# Enable detailed logging to see the fix in action -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - -# Also enable AG-UI specific logging -logging.getLogger("agent_framework_ag_ui").setLevel(logging.INFO) - - -# Tool with approval_mode="always_require" - simulates MCP tool behavior -@tool( - name="get_datetime", - description="Get the current date and time. Requires user approval before execution.", - approval_mode="always_require", -) -def get_datetime() -> str: - """Get the current date and time. - - This tool requires user approval before execution, similar to MCP tools. - - Returns: - The current date and time as a formatted string. - """ - now = datetime.now() - result = now.strftime("%Y-%m-%d %H:%M:%S") - logger.info(f"get_datetime tool executed, returning: {result}") - return result - - -# Another approval-required tool to test multiple different tools -@tool( - name="get_system_info", - description="Get system information. Requires user approval before execution.", - approval_mode="always_require", -) -def get_system_info() -> str: - """Get basic system information. - - This tool also requires user approval, to test calling different approval tools. - - Returns: - Basic system information. - """ - import platform - - info = f"Python {platform.python_version()} on {platform.system()} {platform.release()}" - logger.info(f"get_system_info tool executed, returning: {info}") - return info - - -def create_test_agent() -> ChatAgent: - """Create the test agent with approval-required tools.""" - endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") - deployment_name = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME") - - if not endpoint: - raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is required") - if not deployment_name: - raise ValueError("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME environment variable is required") - - return ChatAgent( - name="MCPTestAgent", - instructions="""You are a helpful assistant that can provide date/time and system information. - -When the user asks about time, date, or "what time is it", use the get_datetime tool. -When the user asks about system info, use the get_system_info tool. - -Important: These tools require user approval before execution. The user will see -an approval dialog and can choose to approve or reject the tool call. - -After a tool is approved and executed, provide a brief, friendly response with the result. 
-""", - chat_client=AzureOpenAIChatClient( - endpoint=endpoint, - deployment_name=deployment_name, - ), - tools=[get_datetime, get_system_info], - ) - - -# Create FastAPI app -app = FastAPI( - title="MCP Double-Call Bug Test", - description="Test server for validating the MCP tool double-call bug fix", -) - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Create the agent -agent = create_test_agent() - -# Register the AG-UI endpoint with require_confirmation=True (like the bug report) -add_agent_framework_fastapi_endpoint( - app=app, - agent=AgentFrameworkAgent( - agent=agent, - name="MCPTestAgent", - description="Test agent for MCP double-call bug fix validation", - require_confirmation=True, - ), - path="/", -) - - -@app.get("/health") -async def health_check(): - """Health check endpoint.""" - return {"status": "healthy", "message": "MCP double-call test server is running"} - - -def main(): - """Run the test server.""" - import uvicorn - - port = int(os.getenv("PORT", "8889")) - host = os.getenv("HOST", "127.0.0.1") - - print("\n" + "=" * 70) - print("MCP Double-Call Bug Fix Test Server") - print("=" * 70) - print(f"\nServer starting on http://{host}:{port}") - print("\nTo test the fix:") - print("1. Send: 'What time is it?' -> Approve -> Should work") - print("2. Send: 'What time is it?' again -> Approve -> Should also work!") - print(" (Before the fix, this second call would fail)") - print("\nEndpoints:") - print(f" - AG-UI: POST http://{host}:{port}/") - print(f" - Health: GET http://{host}:{port}/health") - print("=" * 70 + "\n") - - uvicorn.run(app, host=host, port=port, log_level="info") - - -if __name__ == "__main__": - main() diff --git a/python/packages/ag-ui/tests/test_message_hygiene.py b/python/packages/ag-ui/tests/test_message_hygiene.py index eb55965ab9..b2f74e60a7 100644 --- a/python/packages/ag-ui/tests/test_message_hygiene.py +++ b/python/packages/ag-ui/tests/test_message_hygiene.py @@ -5,7 +5,13 @@ from agent_framework_ag_ui._message_adapters import _deduplicate_messages, _sanitize_tool_history -def test_sanitize_tool_history_injects_confirm_changes_result() -> None: +def test_sanitize_tool_history_filters_out_confirm_changes_only_message() -> None: + """Test that assistant messages with ONLY confirm_changes are filtered out entirely. + + When an assistant message contains only a confirm_changes tool call (no other tools), + the entire message should be filtered out because confirm_changes is a synthetic + tool for the approval UI flow that shouldn't be sent to the LLM. 
+ """ messages = [ ChatMessage( role="assistant", @@ -25,12 +31,17 @@ def test_sanitize_tool_history_injects_confirm_changes_result() -> None: sanitized = _sanitize_tool_history(messages) + # Assistant message with only confirm_changes should be filtered out + assistant_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "assistant" + ] + assert len(assistant_messages) == 0 + + # No synthetic tool result should be injected since confirm_changes was filtered out tool_messages = [ msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" ] - assert len(tool_messages) == 1 - assert str(tool_messages[0].contents[0].call_id) == "call_confirm_123" - assert tool_messages[0].contents[0].result == "Confirmed" + assert len(tool_messages) == 0 def test_deduplicate_messages_prefers_non_empty_tool_results() -> None: @@ -133,22 +144,20 @@ def test_convert_approval_results_preserves_other_user_content() -> None: assert messages[2].contents[0].type == "function_result" -def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: - """Test that confirm_changes is properly handled across multiple requests. +def test_sanitize_tool_history_filters_confirm_changes_keeps_other_tools() -> None: + """Test that confirm_changes is filtered but other tools are preserved. - This is a regression test for the MCP tool double-call bug. When a second - MCP tool call happens after the first one completed, the message history - should have proper synthetic results for all confirm_changes calls. + When an assistant message contains both a real tool call and confirm_changes, + confirm_changes should be filtered out while the real tool call is kept. + No synthetic result is injected for confirm_changes since it's filtered. """ - # Simulate history from first request (already completed) - # Note: confirm_changes synthetic result might be missing from frontend's snapshot messages = [ - # First request - user asks something + # User asks something ChatMessage( role="user", contents=[Content.from_text(text="What time is it?")], ), - # First request - assistant calls MCP tool + confirm_changes + # Assistant calls MCP tool + confirm_changes ChatMessage( role="assistant", contents=[ @@ -156,14 +165,12 @@ def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: Content.from_function_call(call_id="call_c1", name="confirm_changes", arguments="{}"), ], ), - # First request - tool result for the actual MCP tool + # Tool result for the actual MCP tool ChatMessage( role="tool", contents=[Content.from_function_result(call_id="call_1", result="2024-01-01 12:00:00")], ), - # Note: NO tool result for call_c1 (confirm_changes) - this is the bug scenario! 
- # The synthetic was injected in request 1 but not persisted in snapshot - # Second request - user asks something else + # User asks something else ChatMessage( role="user", contents=[Content.from_text(text="What's the date?")], @@ -172,14 +179,91 @@ def test_sanitize_tool_history_multiple_requests_with_confirm_changes() -> None: sanitized = _sanitize_tool_history(messages) - # After sanitization, call_c1 should have a synthetic result + # Find the assistant message + assistant_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "assistant" + ] + assert len(assistant_messages) == 1 + + # Assistant message should only have get_datetime, not confirm_changes + function_call_names = [c.name for c in assistant_messages[0].contents if c.type == "function_call"] + assert "get_datetime" in function_call_names + assert "confirm_changes" not in function_call_names + + # Only one tool message (for call_1), no synthetic for confirm_changes tool_messages = [ msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" ] + assert len(tool_messages) == 1 + assert str(tool_messages[0].contents[0].call_id) == "call_1" + - # Should have 2 tool messages: one for call_1 (real) and one for call_c1 (synthetic) - assert len(tool_messages) == 2 +def test_sanitize_tool_history_filters_confirm_changes_from_assistant_messages() -> None: + """Test that confirm_changes is removed from assistant messages sent to LLM. + This is a regression test for the human-in-the-loop bug where the LLM would see + confirm_changes with function_arguments containing the original steps (e.g., 5 steps) + even when the user only approved a subset (e.g., 2 steps), causing the LLM to + respond with "Here's your 5-step plan" instead of "Here's your 2-step plan". 
+ """ + messages = [ + ChatMessage( + role="user", + contents=[Content.from_text(text="Build a robot")], + ), + # Assistant message with both generate_task_steps and confirm_changes + ChatMessage( + role="assistant", + contents=[ + Content.from_function_call( + call_id="call_1", + name="generate_task_steps", + arguments='{"steps": [{"description": "Step 1"}, {"description": "Step 2"}]}', + ), + Content.from_function_call( + call_id="call_c1", + name="confirm_changes", + arguments='{"function_arguments": {"steps": [{"description": "Step 1"}, {"description": "Step 2"}]}}', + ), + ], + ), + # Approval response + ChatMessage( + role="user", + contents=[ + Content.from_function_approval_response( + approved=True, + id="call_1", + function_call=Content.from_function_call( + call_id="call_1", + name="generate_task_steps", + arguments='{"steps": [{"description": "Step 1"}]}', # Only 1 step approved + ), + ), + ], + ), + ] + + sanitized = _sanitize_tool_history(messages) + + # Find the assistant message in sanitized output + assistant_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "assistant" + ] + + assert len(assistant_messages) == 1 + + # The assistant message should NOT contain confirm_changes + assistant_contents = assistant_messages[0].contents or [] + function_call_names = [c.name for c in assistant_contents if c.type == "function_call"] + assert "generate_task_steps" in function_call_names + assert "confirm_changes" not in function_call_names + + # No synthetic tool result for confirm_changes (it was filtered from the message) + tool_messages = [ + msg for msg in sanitized if (msg.role.value if hasattr(msg.role, "value") else str(msg.role)) == "tool" + ] + # No tool results expected since there are no completed tool calls + # (the approval response is handled separately by the framework) tool_call_ids = {str(msg.contents[0].call_id) for msg in tool_messages} - assert "call_1" in tool_call_ids - assert "call_c1" in tool_call_ids + assert "call_c1" not in tool_call_ids # No synthetic result for confirm_changes diff --git a/python/packages/ag-ui/tests/test_run.py b/python/packages/ag-ui/tests/test_run.py index 5cfcfe7619..d5198f8dc6 100644 --- a/python/packages/ag-ui/tests/test_run.py +++ b/python/packages/ag-ui/tests/test_run.py @@ -2,12 +2,18 @@ """Tests for _run.py helper functions and FlowState.""" +from ag_ui.core import ( + TextMessageEndEvent, + TextMessageStartEvent, +) from agent_framework import ChatMessage, Content from agent_framework_ag_ui._run import ( FlowState, _build_safe_metadata, _create_state_context_message, + _emit_content, + _emit_tool_result, _has_only_tool_calls, _inject_state_context, _should_suppress_intermediate_snapshot, @@ -356,14 +362,10 @@ def test_emit_tool_call_generates_id(): def test_emit_tool_result_closes_open_message(): """Test _emit_tool_result emits TextMessageEndEvent for open text message. - This is a regression test for issue #3568 where TEXT_MESSAGE_END was not + This is a regression test for where TEXT_MESSAGE_END was not emitted when using MCP tools because the message_id was reset without closing the message first. 
""" - from ag_ui.core import TextMessageEndEvent - - from agent_framework_ag_ui._run import FlowState, _emit_tool_result - flow = FlowState() # Simulate an open text message (e.g., from Feature #4 tool-only detection) flow.message_id = "open-msg-123" @@ -387,10 +389,6 @@ def test_emit_tool_result_closes_open_message(): def test_emit_tool_result_no_open_message(): """Test _emit_tool_result works when there's no open text message.""" - from ag_ui.core import TextMessageEndEvent - - from agent_framework_ag_ui._run import FlowState, _emit_tool_result - flow = FlowState() # No open message flow.message_id = None @@ -563,3 +561,128 @@ def test_malformed_json_in_confirm_args_skips_confirmation(): assert should_skip_confirmation is False assert function_arguments == {"content": "hello"} + + +class TestTextMessageEventBalancing: + """Tests for proper TEXT_MESSAGE_START/END event balancing. + + These tests verify that the streaming flow produces balanced pairs of + TextMessageStartEvent and TextMessageEndEvent, especially when tool + execution is involved. + """ + + def test_tool_only_flow_produces_balanced_events(self): + """Test that a tool-only response produces balanced TEXT_MESSAGE events. + + This simulates the scenario where the LLM immediately calls a tool + without any initial text, then returns text after the tool result. + """ + flow = FlowState() + all_events: list = [] + + # Step 1: LLM outputs function_call only (no text) + func_call_content = Content.from_function_call( + call_id="call_weather", + name="get_weather", + arguments='{"city": "Seattle"}', + ) + + # Feature #4 check: this should trigger TextMessageStartEvent + contents = [func_call_content] + if not flow.message_id and _has_only_tool_calls(contents): + flow.message_id = "tool-msg-1" + all_events.append(TextMessageStartEvent(message_id=flow.message_id, role="assistant")) + + # Emit tool call events + all_events.extend(_emit_content(func_call_content, flow)) + + # Step 2: Tool executes and returns result + func_result_content = Content.from_function_result( + call_id="call_weather", + result='{"temp": 55, "conditions": "rainy"}', + ) + + # This should close the text message + all_events.extend(_emit_tool_result(func_result_content, flow)) + + # Verify message_id was reset + assert flow.message_id is None, "message_id should be reset after tool result" + + # Step 3: LLM outputs text response + text_content = Content.from_text("The weather in Seattle is 55°F and rainy.") + + # Since message_id is None, _emit_text should create a new one + for event in _emit_content(text_content, flow): + all_events.append(event) + + # Step 4: End of stream - emit final TextMessageEndEvent + if flow.message_id: + all_events.append(TextMessageEndEvent(message_id=flow.message_id)) + + # Verify event counts + start_events = [e for e in all_events if isinstance(e, TextMessageStartEvent)] + end_events = [e for e in all_events if isinstance(e, TextMessageEndEvent)] + + # Should have 2 TextMessageStartEvent and 2 TextMessageEndEvent + assert len(start_events) == 2, f"Expected 2 start events, got {len(start_events)}" + assert len(end_events) == 2, f"Expected 2 end events, got {len(end_events)}" + + # Verify order: first message should start and end before second starts + # Find indices + start_indices = [i for i, e in enumerate(all_events) if isinstance(e, TextMessageStartEvent)] + end_indices = [i for i, e in enumerate(all_events) if isinstance(e, TextMessageEndEvent)] + + # First end should come before second start + assert end_indices[0] < 
start_indices[1], ( + f"First TextMessageEndEvent (index {end_indices[0]}) should come " + f"before second TextMessageStartEvent (index {start_indices[1]})" + ) + + def test_text_then_tool_flow(self): + """Test flow where LLM outputs text first, then calls a tool. + + This simulates: "Let me check the weather..." -> tool call -> tool result -> "The weather is..." + """ + flow = FlowState() + all_events: list = [] + + # Step 1: LLM outputs text first + text1 = Content.from_text("Let me check the weather for you.") + all_events.extend(_emit_content(text1, flow)) + + # Verify message_id is set + assert flow.message_id is not None, "message_id should be set after text" + first_msg_id = flow.message_id + + # Step 2: LLM outputs function_call + func_call = Content.from_function_call( + call_id="call_1", + name="get_weather", + arguments="{}", + ) + all_events.extend(_emit_content(func_call, flow)) + + # Step 3: Tool result comes back + func_result = Content.from_function_result(call_id="call_1", result="sunny") + all_events.extend(_emit_tool_result(func_result, flow)) + + # Verify message_id was reset and first message was closed + assert flow.message_id is None + end_events_so_far = [e for e in all_events if isinstance(e, TextMessageEndEvent)] + assert len(end_events_so_far) == 1 + assert end_events_so_far[0].message_id == first_msg_id + + # Step 4: LLM outputs follow-up text + text2 = Content.from_text("The weather is sunny!") + all_events.extend(_emit_content(text2, flow)) + + # Step 5: End of stream + if flow.message_id: + all_events.append(TextMessageEndEvent(message_id=flow.message_id)) + + # Verify balance + start_events = [e for e in all_events if isinstance(e, TextMessageStartEvent)] + end_events = [e for e in all_events if isinstance(e, TextMessageEndEvent)] + + assert len(start_events) == 2 + assert len(end_events) == 2 From a8e11ad16df41bf4d704463a1233b68564913d4f Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 14:43:56 +0900 Subject: [PATCH 4/6] Revert human_in_the_loop_agent.py changes --- .../agents/human_in_the_loop_agent.py | 40 +++---------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py index fde3fb56a7..368c4e47ed 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py +++ b/python/packages/ag-ui/agent_framework_ag_ui_examples/agents/human_in_the_loop_agent.py @@ -1,14 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. -"""Human-in-the-loop agent demonstrating step customization (Feature 5). +"""Human-in-the-loop agent demonstrating step customization (Feature 5).""" -This agent also serves as a test case for the MCP tool double-call bug fix. -To test: call the agent twice with tasks that require approval (e.g., "Build a robot" -then "Build a house"). Before the fix, the second call would fail with: -"An assistant message with 'tool_calls' must be followed by tool messages..." -""" - -from datetime import datetime from enum import Enum from typing import Any @@ -30,24 +23,6 @@ class TaskStep(BaseModel): status: StepStatus = Field(default=StepStatus.ENABLED, description="Whether the step is enabled or disabled") -# Simple tool for quick testing of the double-call bug fix -@tool( - name="get_current_time", - description="Get the current date and time. 
Requires user approval.", - approval_mode="always_require", -) -def get_current_time() -> str: - """Get the current date and time. - - This is a simple tool for testing the approval flow. Call it multiple times - to verify the double-call bug fix works. - - Returns: - Current date and time string. - """ - return datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - @tool( name="generate_task_steps", description="Generate execution steps for a task", @@ -80,13 +55,7 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A return ChatAgent( name="human_in_the_loop_agent", instructions="""You are a helpful assistant that can perform any task by breaking it down into steps. - You can also tell the user the current time. - - FOR TIME REQUESTS: - When the user asks "what time is it?" or similar, call the `get_current_time` function. - This is useful for testing the approval flow multiple times quickly. - FOR TASK PLANNING: When asked to perform a task, you MUST call the `generate_task_steps` function with the proper number of steps per the request. @@ -107,10 +76,11 @@ def human_in_the_loop_agent(chat_client: ChatClientProtocol[Any]) -> ChatAgent[A 9. "Calibrate systems" 10. "Final testing" - IMPORTANT: Both tools require user approval before execution. + IMPORTANT: When you call generate_task_steps, the user will be shown the steps and asked to approve. Do NOT output any text along with the function call - just call the function. - After the user approves and the function executes, provide a brief response with the result. + After the user approves and the function executes, THEN provide a brief acknowledgment like: + "The plan has been created with X steps selected." """, chat_client=chat_client, - tools=[get_current_time, generate_task_steps], + tools=[generate_task_steps], ) From ba82b672ed91012023d535022ecb3a3ca5721dcf Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Tue, 3 Feb 2026 15:27:31 +0900 Subject: [PATCH 5/6] Address copilot feedback --- .../packages/ag-ui/agent_framework_ag_ui/_message_adapters.py | 4 ++-- python/packages/ag-ui/agent_framework_ag_ui/_run.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index 798569d2ca..0187de012e 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -58,8 +58,8 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: c for c in (msg.contents or []) if not (c.type == "function_call" and c.name == "confirm_changes") ] if filtered_contents: - # Create a new message without confirm_changes - msg = ChatMessage(role=msg.role, contents=filtered_contents) + # Update the existing message without confirm_changes, preserving metadata + msg.contents = filtered_contents sanitized.append(msg) # If no contents left after filtering, don't append anything diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index 6457ce3ba8..3c06880436 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -391,7 +391,7 @@ def _emit_tool_result( # This handles the case where a TextMessageStartEvent was emitted for tool-only # messages (Feature #4) but needs to be closed before starting a new message if flow.message_id: - logger.info(f"Closing text 
message (issue #3568 fix): message_id={flow.message_id}") + logger.debug("Closing text message (issue #3568 fix): message_id=%s", flow.message_id) events.append(TextMessageEndEvent(message_id=flow.message_id)) flow.message_id = None # Reset so next text content starts a new message From 47693369ae90afe82118069fb7354068358bf031 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Wed, 4 Feb 2026 07:44:31 +0900 Subject: [PATCH 6/6] PR feedback addressed --- .../_message_adapters.py | 8 +- .../ag-ui/agent_framework_ag_ui/_run.py | 98 ++++++++----------- .../ag-ui/tests/test_message_hygiene.py | 17 ++-- 3 files changed, 53 insertions(+), 70 deletions(-) diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py index 0187de012e..79a07edd02 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_message_adapters.py @@ -58,9 +58,9 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: c for c in (msg.contents or []) if not (c.type == "function_call" and c.name == "confirm_changes") ] if filtered_contents: - # Update the existing message without confirm_changes, preserving metadata - msg.contents = filtered_contents - sanitized.append(msg) + # Create a new message without confirm_changes to avoid mutating the input + filtered_msg = ChatMessage(role=msg.role, contents=filtered_contents) + sanitized.append(filtered_msg) # If no contents left after filtering, don't append anything # Remove confirm_changes from tool_ids since we filtered it from the message @@ -119,6 +119,8 @@ def _sanitize_tool_history(messages: list[ChatMessage]) -> list[ChatMessage]: user_text = content.text # type: ignore[assignment] break + if not user_text: + continue try: parsed = json.loads(user_text) # type: ignore[arg-type] if "accepted" in parsed: diff --git a/python/packages/ag-ui/agent_framework_ag_ui/_run.py b/python/packages/ag-ui/agent_framework_ag_ui/_run.py index 3c06880436..1195d3b42f 100644 --- a/python/packages/ag-ui/agent_framework_ag_ui/_run.py +++ b/python/packages/ag-ui/agent_framework_ag_ui/_run.py @@ -516,7 +516,7 @@ def _is_confirm_changes_response(messages: list[Any]) -> bool: # Parse the content to check if it has the confirm_changes structure for content in last.contents: - if getattr(content, "type", None) == "text": + if getattr(content, "type", None) == "text" and content.text: try: result = json.loads(content.text) # confirm_changes results have 'accepted' and 'steps' keys @@ -536,31 +536,34 @@ def _handle_step_based_approval(messages: list[Any]) -> list[BaseEvent]: # Parse the approval content approval_text = "" for content in last.contents: - if getattr(content, "type", None) == "text": + if getattr(content, "type", None) == "text" and content.text: approval_text = content.text break - try: - result = json.loads(approval_text) - accepted = result.get("accepted", False) - steps = result.get("steps", []) - - if accepted: - # Generate acceptance message with step descriptions - enabled_steps = [s for s in steps if s.get("status") == "enabled"] - if enabled_steps: - message_parts = [f"Executing {len(enabled_steps)} approved steps:\n\n"] - for i, step in enumerate(enabled_steps, 1): - message_parts.append(f"{i}. {step.get('description', 'Step')}\n") - message_parts.append("\nAll steps completed successfully!") - message = "".join(message_parts) - else: - message = "Changes confirmed and applied successfully!" 
- else: - # Rejection message - message = "No problem! What would you like me to change about the plan?" - except json.JSONDecodeError: + if not approval_text: message = "Acknowledged." + else: + try: + result = json.loads(approval_text) + accepted = result.get("accepted", False) + steps = result.get("steps", []) + + if accepted: + # Generate acceptance message with step descriptions + enabled_steps = [s for s in steps if s.get("status") == "enabled"] + if enabled_steps: + message_parts = [f"Executing {len(enabled_steps)} approved steps:\n\n"] + for i, step in enumerate(enabled_steps, 1): + message_parts.append(f"{i}. {step.get('description', 'Step')}\n") + message_parts.append("\nAll steps completed successfully!") + message = "".join(message_parts) + else: + message = "Changes confirmed and applied successfully!" + else: + # Rejection message + message = "No problem! What would you like me to change about the plan?" + except json.JSONDecodeError: + message = "Acknowledged." message_id = generate_event_id() events.append(TextMessageStartEvent(message_id=message_id, role="assistant")) @@ -661,56 +664,33 @@ def _convert_approval_results_to_tool_messages(messages: list[Any]) -> None: Args: messages: List of ChatMessage objects to process """ - i = 0 - while i < len(messages): - msg = messages[i] - role_value = get_role_value(msg) + result: list[Any] = [] - if role_value != "user": - i += 1 + for msg in messages: + if get_role_value(msg) != "user": + result.append(msg) continue - # Check if this user message has function_result content - function_results: list[Content] = [] - other_contents: list[Any] = [] - - for content in msg.contents or []: - if getattr(content, "type", None) == "function_result": - function_results.append(content) - else: - other_contents.append(content) + function_results = [c for c in (msg.contents or []) if getattr(c, "type", None) == "function_result"] + other_contents = [c for c in (msg.contents or []) if getattr(c, "type", None) != "function_result"] if not function_results: - i += 1 + result.append(msg) continue - # We have function results in a user message - need to fix this logger.info( f"Converting {len(function_results)} function_result content(s) from user message to tool message(s)" ) - # Create tool messages for each function result - new_tool_messages = [] + # Tool messages first (right after the preceding assistant message per OpenAI requirements) for func_result in function_results: - tool_msg = ChatMessage( - role="tool", - contents=[func_result], - ) - new_tool_messages.append(tool_msg) + result.append(ChatMessage(role="tool", contents=[func_result])) + # Then user message with remaining content (if any) if other_contents: - # Keep the user message with remaining contents - msg.contents = other_contents - # Insert tool messages after this user message - for j, tool_msg in enumerate(new_tool_messages): - messages.insert(i + 1 + j, tool_msg) - i += 1 + len(new_tool_messages) - else: - # No other contents - replace user message with tool messages - messages.pop(i) - for j, tool_msg in enumerate(new_tool_messages): - messages.insert(i + j, tool_msg) - i += len(new_tool_messages) + result.append(ChatMessage(role=msg.role, contents=other_contents)) + + messages[:] = result def _build_messages_snapshot( @@ -1063,7 +1043,7 @@ async def run_agent_stream( # Close any open message if flow.message_id: - logger.info(f"End of run: closing text message message_id={flow.message_id}") + logger.debug(f"End of run: closing text message message_id={flow.message_id}") 
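+            # This end event pairs with the TextMessageStartEvent emitted
+            # earlier in the stream, keeping START/END events balanced for
+            # AG-UI clients (see TestTextMessageEventBalancing in test_run.py).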
yield TextMessageEndEvent(message_id=flow.message_id) # Emit MessagesSnapshotEvent if we have tool calls or results diff --git a/python/packages/ag-ui/tests/test_message_hygiene.py b/python/packages/ag-ui/tests/test_message_hygiene.py index b2f74e60a7..70567d00ab 100644 --- a/python/packages/ag-ui/tests/test_message_hygiene.py +++ b/python/packages/ag-ui/tests/test_message_hygiene.py @@ -128,20 +128,21 @@ def test_convert_approval_results_preserves_other_user_content() -> None: _convert_approval_results_to_tool_messages(messages) - # Should have 3 messages now: assistant, user (with text), tool (with result) + # Should have 3 messages now: assistant, tool (with result), user (with text) + # OpenAI requires tool messages immediately after the assistant message with the tool call assert len(messages) == 3 # First message unchanged assert messages[0].role.value == "assistant" - # Second message should be user with just text - assert messages[1].role.value == "user" - assert len(messages[1].contents) == 1 - assert messages[1].contents[0].type == "text" + # Second message should be tool with result (must come right after assistant per OpenAI requirements) + assert messages[1].role.value == "tool" + assert messages[1].contents[0].type == "function_result" - # Third message should be tool with result - assert messages[2].role.value == "tool" - assert messages[2].contents[0].type == "function_result" + # Third message should be user with just text + assert messages[2].role.value == "user" + assert len(messages[2].contents) == 1 + assert messages[2].contents[0].type == "text" def test_sanitize_tool_history_filters_confirm_changes_keeps_other_tools() -> None: