From 3c3206f2bea1c2ea3680e1cfba36155de4025de0 Mon Sep 17 00:00:00 2001 From: alliscode Date: Wed, 18 Feb 2026 15:55:47 -0800 Subject: [PATCH 1/2] Fix Responses API handoff state handling and add focused tests --- .../_handoff.py | 13 + .../tests/test_handoff_responses.py | 243 ++++++++++++++++++ 2 files changed, 256 insertions(+) create mode 100644 python/packages/orchestrations/tests/test_handoff_responses.py diff --git a/python/packages/orchestrations/agent_framework_orchestrations/_handoff.py b/python/packages/orchestrations/agent_framework_orchestrations/_handoff.py index e0dc3e8ea9..efa5caa52f 100644 --- a/python/packages/orchestrations/agent_framework_orchestrations/_handoff.py +++ b/python/packages/orchestrations/agent_framework_orchestrations/_handoff.py @@ -375,6 +375,12 @@ async def _run_agent_and_emit( if await self._check_terminate_and_yield(cast(WorkflowContext[Never, list[Message]], ctx)): return + # Use full conversation history as agent input so the agent sees all prior + # turns, not just the latest broadcast. This is critical for APIs like the + # Responses API where clearing service_session_id (on handoff) means the + # server no longer carries implicit context via previous_response_id. + self._cache = list(self._full_conversation) + # Run the agent if ctx.is_streaming(): # Streaming mode: emit incremental updates @@ -409,6 +415,13 @@ async def _run_agent_and_emit( f"target '{handoff_target}'. Valid targets are: {', '.join(self._handoff_targets)}" ) + # Clear the session's service_session_id to prevent stale previous_response_id + # from being sent on the next run. The handoff response contained a function_call + # for the handoff tool; referencing it via previous_response_id after the tool + # output has been cleaned would cause "No tool output found" API errors. + if self._session and self._session.service_session_id: + self._session.service_session_id = None + await cast(WorkflowContext[AgentExecutorRequest], ctx).send_message( AgentExecutorRequest(messages=[], should_respond=True), target_id=handoff_target ) diff --git a/python/packages/orchestrations/tests/test_handoff_responses.py b/python/packages/orchestrations/tests/test_handoff_responses.py new file mode 100644 index 0000000000..ba809b48f4 --- /dev/null +++ b/python/packages/orchestrations/tests/test_handoff_responses.py @@ -0,0 +1,243 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Regression tests for HandoffBuilder with Responses API style clients. + +These tests cover two handoff invariants: +1. A handoff should clear the service conversation pointer so stale response IDs are not reused. +2. A resumed agent should receive full conversation context, including the original user prompt. +""" + +from collections.abc import AsyncIterable, Awaitable, Mapping, Sequence +from typing import Any, cast + +from agent_framework import ( + Agent, + ChatResponse, + ChatResponseUpdate, + Content, + Message, + ResponseStream, +) +from agent_framework._clients import BaseChatClient +from agent_framework._middleware import ChatMiddlewareLayer +from agent_framework._tools import FunctionInvocationLayer +from agent_framework.orchestrations import HandoffBuilder + + +class ResponsesApiMockClient(ChatMiddlewareLayer[Any], FunctionInvocationLayer[Any], BaseChatClient[Any]): + """Mock client that simulates AzureOpenAIResponsesClient behavior. + + Key differences from the standard MockChatClient in test_handoff.py: + - Sets conversation_id on responses (like resp_XXX), which causes + session.service_session_id to be updated after each agent run. + - Sets STORES_BY_DEFAULT = True to prevent InMemoryHistoryProvider auto-injection, + matching the real AzureOpenAIResponsesClient behavior. + - Tracks all received messages and conversation IDs for assertions. + """ + + # Prevent InMemoryHistoryProvider from being auto-injected. + # The real AzureOpenAIResponsesClient uses server-side storage (via previous_response_id), + # so InMemoryHistoryProvider is not needed and would hide context-handling regressions. + STORES_BY_DEFAULT = True + + def __init__( + self, + *, + name: str = "", + handoff_to: str | None = None, + ) -> None: + ChatMiddlewareLayer.__init__(self) + FunctionInvocationLayer.__init__(self) + BaseChatClient.__init__(self) + self._name = name + self._handoff_to = handoff_to + self._call_index = 0 + self._response_counter = 0 + # Track messages received on each call for context assertions. + self.received_messages_per_call: list[list[Message]] = [] + # Track conversation_id received on each call for stale-id assertions. + self.received_conversation_ids: list[str | None] = [] + + def _inner_get_response( + self, + *, + messages: Sequence[Message], + stream: bool, + options: Mapping[str, Any], + **kwargs: Any, + ) -> Awaitable[ChatResponse] | ResponseStream[ChatResponseUpdate, ChatResponse]: + # Record messages and conversation_id for assertions. + self.received_messages_per_call.append(list(messages)) + self.received_conversation_ids.append(options.get("conversation_id") if options else None) + + self._response_counter += 1 + resp_id = f"resp_{self._name}_{self._response_counter}" + + if stream: + return self._build_streaming_response(options=dict(options), resp_id=resp_id) + + async def _get() -> ChatResponse: + contents = self._build_reply_contents() + reply = Message(role="assistant", contents=contents) + # Simulate Responses API: set conversation_id to resp_XXX + return ChatResponse( + messages=reply, + response_id=resp_id, + conversation_id=resp_id, + ) + + return _get() + + def _build_streaming_response( + self, *, options: dict[str, Any], resp_id: str + ) -> ResponseStream[ChatResponseUpdate, ChatResponse]: + async def _stream() -> AsyncIterable[ChatResponseUpdate]: + contents = self._build_reply_contents() + yield ChatResponseUpdate(contents=contents, role="assistant", finish_reason="stop") + + def _finalize(updates: Sequence[ChatResponseUpdate]) -> ChatResponse: + response_format = options.get("response_format") + output_format_type = response_format if isinstance(response_format, type) else None + resp = ChatResponse.from_updates(updates, output_format_type=output_format_type) + # Simulate Responses API: set conversation_id + resp.conversation_id = resp_id + return resp + + return ResponseStream(_stream(), finalizer=_finalize) + + def _build_reply_contents(self) -> list[Content]: + contents: list[Content] = [] + if self._handoff_to and self._call_index == 0: + # Only handoff on first call + call_id = f"{self._name}-handoff-{self._call_index}" + self._call_index += 1 + contents.append( + Content.from_function_call( + call_id=call_id, + name=f"handoff_to_{self._handoff_to}", + arguments={"handoff_to": self._handoff_to}, + ) + ) + text = f"{self._name} reply (call {self._call_index})" + self._call_index += 1 + contents.append(Content.from_text(text=text)) + return contents + + +class ResponsesApiMockAgent(Agent): + """Mock agent that simulates Responses API behavior for handoff testing.""" + + def __init__(self, *, name: str, handoff_to: str | None = None) -> None: + client = ResponsesApiMockClient(name=name, handoff_to=handoff_to) + super().__init__(client=client, name=name, id=name) + + +async def test_handoff_clears_stale_conversation_id_before_resume(): + """A resumed agent should not receive a stale conversation_id after handoff.""" + coordinator = ResponsesApiMockAgent(name="coordinator", handoff_to="specialist") + specialist = ResponsesApiMockAgent(name="specialist", handoff_to="coordinator") + + workflow = ( + HandoffBuilder( + participants=[coordinator, specialist], + termination_condition=lambda conv: len(conv) >= 6, + ) + .with_start_agent(coordinator) + .add_handoff(coordinator, [specialist]) + .add_handoff(specialist, [coordinator]) + .build() + ) + + # Use non-streaming so conversation_id from ChatResponse propagates to session + result = await workflow.run("Research topic X", stream=False) + + # Verify handoffs occurred + handoff_events = [ev for ev in result if ev.type == "handoff_sent"] + assert len(handoff_events) >= 1, "At least one handoff should have occurred" + + # Get the coordinator executor and its underlying mock client + coordinator_executor = workflow.executors["coordinator"] + cloned_agent = coordinator_executor._agent # type: ignore[attr-defined] + mock_client = cast(ResponsesApiMockClient, cloned_agent.client) + + # The coordinator should have been called at least twice + assert len(mock_client.received_conversation_ids) >= 2, ( + f"Coordinator should have been called at least twice, " + f"but was called {len(mock_client.received_conversation_ids)} times" + ) + + # The 1st call to the coordinator has no conversation_id (first run, no prior response) + first_conversation_id = mock_client.received_conversation_ids[0] + + # The 2nd call should NOT receive the conversation_id from the 1st response + # (resp_coordinator_1), because that response contained the handoff function_call. + # The session's service_session_id should have been cleared after the handoff, + # so the 2nd call should receive None as conversation_id. + second_conversation_id = mock_client.received_conversation_ids[1] + assert second_conversation_id is None, ( + f"Coordinator's 2nd invocation should not receive a stale conversation_id, " + f"but got '{second_conversation_id}' (1st call had '{first_conversation_id}'). " + f"The stale response ID would cause 'No tool output found for function call' " + f"error with the real Responses API." + ) + + +async def test_handoff_preserves_full_context_for_resumed_agent(): + """A resumed agent should see full history, including the original user prompt.""" + coordinator = ResponsesApiMockAgent(name="coordinator", handoff_to="specialist") + specialist = ResponsesApiMockAgent(name="specialist", handoff_to="coordinator") + + workflow = ( + HandoffBuilder( + participants=[coordinator, specialist], + termination_condition=lambda conv: len(conv) >= 6, + ) + .with_start_agent(coordinator) + .add_handoff(coordinator, [specialist]) + .add_handoff(specialist, [coordinator]) + .build() + ) + + # Use non-streaming so conversation_id propagates to session + result = await workflow.run("Research topic X", stream=False) + + # Verify handoffs happened + handoff_events = [ev for ev in result if ev.type == "handoff_sent"] + assert len(handoff_events) >= 1, "At least one handoff should have occurred" + + # Get the coordinator executor and its underlying mock client + coordinator_executor = workflow.executors["coordinator"] + cloned_agent = coordinator_executor._agent # type: ignore[attr-defined] + mock_client = cast(ResponsesApiMockClient, cloned_agent.client) + + # The coordinator should have been called at least twice: + # 1st call: initial run with user message "Research topic X" + # 2nd call: after specialist hands back + assert len(mock_client.received_messages_per_call) >= 2, ( + f"Coordinator should have been called at least twice, " + f"but was called {len(mock_client.received_messages_per_call)} times" + ) + + # Check the 2nd call to the coordinator (after handoff back from specialist) + second_call_messages = mock_client.received_messages_per_call[1] + + # The second call should include the original user message + # "Research topic X" in its history. But because only _cache is passed + # (which only has the specialist's broadcast), the original user message is lost. + # + # With the Responses API, once service_session_id is set, InMemoryHistoryProvider + # is NOT auto-injected (see _prepare_run_context line 991). So the agent only + # sees _cache (partial history), not _full_conversation (complete history). + # + # The fix: use _full_conversation instead of _cache when running the agent + # after a handoff, so the agent sees the complete conversation history. + user_messages_in_second_call = [ + msg for msg in second_call_messages if msg.role == "user" and msg.text and "Research topic X" in msg.text + ] + assert len(user_messages_in_second_call) > 0, ( + f"Coordinator's 2nd invocation should include the original user message " + f"'Research topic X' but it's missing. The agent only received {len(second_call_messages)} messages: " + f"{[f'{m.role}: {m.text}' for m in second_call_messages]}. " + f"This means conversation context is lost after handoff." + ) + From aa6b6afea671490d6b0736470afabfb8ce3f5068 Mon Sep 17 00:00:00 2001 From: alliscode Date: Wed, 18 Feb 2026 16:10:18 -0800 Subject: [PATCH 2/2] Formatting files. --- python/packages/orchestrations/tests/test_handoff_responses.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/packages/orchestrations/tests/test_handoff_responses.py b/python/packages/orchestrations/tests/test_handoff_responses.py index ba809b48f4..ff645f8f1c 100644 --- a/python/packages/orchestrations/tests/test_handoff_responses.py +++ b/python/packages/orchestrations/tests/test_handoff_responses.py @@ -240,4 +240,3 @@ async def test_handoff_preserves_full_context_for_resumed_agent(): f"{[f'{m.role}: {m.text}' for m in second_call_messages]}. " f"This means conversation context is lost after handoff." ) -