From d7c39806b0b4760fc6c23f46d83923d0f352df67 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 17 Feb 2026 09:01:43 +0900 Subject: [PATCH 1/2] fix: emit tracing function spans for shell/apply_patch/computer runtime tools --- src/agents/run_internal/tool_actions.py | 563 +++++++++++++--------- src/agents/run_internal/tool_execution.py | 30 +- tests/test_apply_patch_tool.py | 90 +++- tests/test_computer_action.py | 97 ++++ tests/test_shell_tool.py | 82 ++++ 5 files changed, 638 insertions(+), 224 deletions(-) diff --git a/src/agents/run_internal/tool_actions.py b/src/agents/run_internal/tool_actions.py index 29086b4c67..010518dd7d 100644 --- a/src/agents/run_internal/tool_actions.py +++ b/src/agents/run_internal/tool_actions.py @@ -6,7 +6,9 @@ from __future__ import annotations import asyncio +import dataclasses import inspect +import json from typing import TYPE_CHECKING, Any, Literal, cast from openai.types.responses import ResponseComputerToolCall @@ -39,6 +41,7 @@ ShellResult, resolve_computer, ) +from ..tracing import SpanError from ..util import _coro from ..util._approvals import evaluate_needs_approval_setting from .items import apply_patch_rejection_item, shell_rejection_item @@ -47,6 +50,7 @@ coerce_shell_call, extract_apply_patch_call_id, format_shell_error, + get_trace_tool_error, normalize_apply_patch_result, normalize_max_output_length, normalize_shell_output, @@ -56,6 +60,7 @@ resolve_approval_status, serialize_shell_output, truncate_shell_outputs, + with_tool_function_span, ) if TYPE_CHECKING: @@ -75,6 +80,22 @@ ] +def _serialize_trace_payload(payload: Any) -> str: + """Serialize tool payloads for tracing while tolerating non-JSON values.""" + if payload is None: + return "" + if isinstance(payload, str): + return payload + if hasattr(payload, "model_dump") and callable(payload.model_dump): + return json.dumps(payload.model_dump(exclude_none=True)) + if dataclasses.is_dataclass(payload) and not isinstance(payload, type): + return json.dumps(dataclasses.asdict(payload)) + try: + return json.dumps(payload) + except TypeError: + return str(payload) + + class ComputerAction: """Execute computer tool actions and emit screenshot outputs with hooks fired.""" @@ -90,41 +111,76 @@ async def execute( acknowledged_safety_checks: list[ComputerCallOutputAcknowledgedSafetyCheck] | None = None, ) -> RunItem: """Run a computer action, capturing a screenshot and notifying hooks.""" - computer = await resolve_computer(tool=action.computer_tool, run_context=context_wrapper) - agent_hooks = agent.hooks - await asyncio.gather( - hooks.on_tool_start(context_wrapper, agent, action.computer_tool), - ( - agent_hooks.on_tool_start(context_wrapper, agent, action.computer_tool) - if agent_hooks - else _coro.noop_coroutine() - ), - ) - output = await cls._execute_action_and_capture(computer, action.tool_call) + async def _run_action(span: Any | None) -> RunItem: + if span and config.trace_include_sensitive_data: + span.span_data.input = _serialize_trace_payload(action.tool_call.action) - await asyncio.gather( - hooks.on_tool_end(context_wrapper, agent, action.computer_tool, output), - ( - agent_hooks.on_tool_end(context_wrapper, agent, action.computer_tool, output) - if agent_hooks - else _coro.noop_coroutine() - ), - ) + computer = await resolve_computer( + tool=action.computer_tool, run_context=context_wrapper + ) + agent_hooks = agent.hooks + await asyncio.gather( + hooks.on_tool_start(context_wrapper, agent, action.computer_tool), + ( + agent_hooks.on_tool_start(context_wrapper, agent, 
action.computer_tool) + if agent_hooks + else _coro.noop_coroutine() + ), + ) - image_url = f"data:image/png;base64,{output}" - return ToolCallOutputItem( - agent=agent, - output=image_url, - raw_item=ComputerCallOutput( - call_id=action.tool_call.call_id, - output={ - "type": "computer_screenshot", - "image_url": image_url, - }, - type="computer_call_output", - acknowledged_safety_checks=acknowledged_safety_checks, - ), + output = "" + try: + output = await cls._execute_action_and_capture(computer, action.tool_call) + except Exception as exc: + error_text = format_shell_error(exc) + trace_error = get_trace_tool_error( + trace_include_sensitive_data=config.trace_include_sensitive_data, + error_message=error_text, + ) + if span: + span.set_error( + SpanError( + message="Error running tool", + data={ + "tool_name": action.computer_tool.name, + "error": trace_error, + }, + ) + ) + logger.error("Failed to execute computer action: %s", exc, exc_info=True) + + await asyncio.gather( + hooks.on_tool_end(context_wrapper, agent, action.computer_tool, output), + ( + agent_hooks.on_tool_end(context_wrapper, agent, action.computer_tool, output) + if agent_hooks + else _coro.noop_coroutine() + ), + ) + + image_url = f"data:image/png;base64,{output}" if output else "" + if span and config.trace_include_sensitive_data: + span.span_data.output = image_url + + return ToolCallOutputItem( + agent=agent, + output=image_url, + raw_item=ComputerCallOutput( + call_id=action.tool_call.call_id, + output={ + "type": "computer_screenshot", + "image_url": image_url, + }, + type="computer_call_output", + acknowledged_safety_checks=acknowledged_safety_checks, + ), + ) + + return await with_tool_function_span( + config=config, + tool_name=action.computer_tool.name, + fn=_run_action, ) @classmethod @@ -234,134 +290,167 @@ async def execute( shell_tool = call.shell_tool agent_hooks = agent.hooks - needs_approval_result = await evaluate_needs_approval_setting( - shell_tool.needs_approval, context_wrapper, shell_call.action, shell_call.call_id - ) + async def _run_call(span: Any | None) -> RunItem: + if span and config.trace_include_sensitive_data: + span.span_data.input = _serialize_trace_payload( + dataclasses.asdict(shell_call.action) + ) - if needs_approval_result: - approval_status, approval_item = await resolve_approval_status( - tool_name=shell_tool.name, - call_id=shell_call.call_id, - raw_item=call.tool_call, - agent=agent, - context_wrapper=context_wrapper, - on_approval=shell_tool.on_approval, + needs_approval_result = await evaluate_needs_approval_setting( + shell_tool.needs_approval, context_wrapper, shell_call.action, shell_call.call_id ) - if approval_status is False: - rejection_message = await resolve_approval_rejection_message( - context_wrapper=context_wrapper, - run_config=config, - tool_type="shell", + if needs_approval_result: + approval_status, approval_item = await resolve_approval_status( tool_name=shell_tool.name, call_id=shell_call.call_id, - ) - return shell_rejection_item( - agent, - shell_call.call_id, - rejection_message=rejection_message, + raw_item=call.tool_call, + agent=agent, + context_wrapper=context_wrapper, + on_approval=shell_tool.on_approval, ) - if approval_status is not True: - return approval_item - - await asyncio.gather( - hooks.on_tool_start(context_wrapper, agent, shell_tool), - ( - agent_hooks.on_tool_start(context_wrapper, agent, shell_tool) - if agent_hooks - else _coro.noop_coroutine() - ), - ) - request = ShellCommandRequest(ctx_wrapper=context_wrapper, data=shell_call) - 
status: Literal["completed", "failed"] = "completed" - output_text = "" - shell_output_payload: list[dict[str, Any]] | None = None - provider_meta: dict[str, Any] | None = None - max_output_length: int | None = None - requested_max_output_length = normalize_max_output_length( - shell_call.action.max_output_length - ) - - try: - executor = call.shell_tool.executor - if executor is None: - raise ModelBehaviorError("Shell tool has no local executor configured.") - executor_result = executor(request) - result = ( - await executor_result if inspect.isawaitable(executor_result) else executor_result + if approval_status is False: + rejection_message = await resolve_approval_rejection_message( + context_wrapper=context_wrapper, + run_config=config, + tool_type="shell", + tool_name=shell_tool.name, + call_id=shell_call.call_id, + ) + return shell_rejection_item( + agent, + shell_call.call_id, + rejection_message=rejection_message, + ) + + if approval_status is not True: + return approval_item + + await asyncio.gather( + hooks.on_tool_start(context_wrapper, agent, shell_tool), + ( + agent_hooks.on_tool_start(context_wrapper, agent, shell_tool) + if agent_hooks + else _coro.noop_coroutine() + ), + ) + request = ShellCommandRequest(ctx_wrapper=context_wrapper, data=shell_call) + status: Literal["completed", "failed"] = "completed" + output_text = "" + shell_output_payload: list[dict[str, Any]] | None = None + provider_meta: dict[str, Any] | None = None + max_output_length: int | None = None + requested_max_output_length = normalize_max_output_length( + shell_call.action.max_output_length ) - if isinstance(result, ShellResult): - normalized = [normalize_shell_output(entry) for entry in result.output] - result_max_output_length = normalize_max_output_length(result.max_output_length) - if result_max_output_length is None: - max_output_length = requested_max_output_length - elif requested_max_output_length is None: - max_output_length = result_max_output_length + try: + executor = call.shell_tool.executor + if executor is None: + raise ModelBehaviorError("Shell tool has no local executor configured.") + executor_result = executor(request) + result = ( + await executor_result + if inspect.isawaitable(executor_result) + else executor_result + ) + + if isinstance(result, ShellResult): + normalized = [normalize_shell_output(entry) for entry in result.output] + result_max_output_length = normalize_max_output_length(result.max_output_length) + if result_max_output_length is None: + max_output_length = requested_max_output_length + elif requested_max_output_length is None: + max_output_length = result_max_output_length + else: + max_output_length = min( + result_max_output_length, requested_max_output_length + ) + if max_output_length is not None: + normalized = truncate_shell_outputs(normalized, max_output_length) + output_text = render_shell_outputs(normalized) + if max_output_length is not None: + output_text = output_text[:max_output_length] + shell_output_payload = [serialize_shell_output(entry) for entry in normalized] + provider_meta = dict(result.provider_data or {}) else: - max_output_length = min(result_max_output_length, requested_max_output_length) - if max_output_length is not None: - normalized = truncate_shell_outputs(normalized, max_output_length) - output_text = render_shell_outputs(normalized) - if max_output_length is not None: - output_text = output_text[:max_output_length] - shell_output_payload = [serialize_shell_output(entry) for entry in normalized] - provider_meta = 
dict(result.provider_data or {}) - else: - output_text = str(result) + output_text = str(result) + if requested_max_output_length is not None: + max_output_length = requested_max_output_length + output_text = output_text[:max_output_length] + except Exception as exc: + status = "failed" + output_text = format_shell_error(exc) + trace_error = get_trace_tool_error( + trace_include_sensitive_data=config.trace_include_sensitive_data, + error_message=output_text, + ) + if span: + span.set_error( + SpanError( + message="Error running tool", + data={ + "tool_name": shell_tool.name, + "error": trace_error, + }, + ) + ) if requested_max_output_length is not None: max_output_length = requested_max_output_length output_text = output_text[:max_output_length] - except Exception as exc: - status = "failed" - output_text = format_shell_error(exc) - if requested_max_output_length is not None: - max_output_length = requested_max_output_length - output_text = output_text[:max_output_length] - logger.error("Shell executor failed: %s", exc, exc_info=True) - - await asyncio.gather( - hooks.on_tool_end(context_wrapper, agent, call.shell_tool, output_text), - ( - agent_hooks.on_tool_end(context_wrapper, agent, call.shell_tool, output_text) - if agent_hooks - else _coro.noop_coroutine() - ), - ) + logger.error("Shell executor failed: %s", exc, exc_info=True) + + await asyncio.gather( + hooks.on_tool_end(context_wrapper, agent, call.shell_tool, output_text), + ( + agent_hooks.on_tool_end(context_wrapper, agent, call.shell_tool, output_text) + if agent_hooks + else _coro.noop_coroutine() + ), + ) - raw_entries: list[dict[str, Any]] | None = None - if shell_output_payload: - raw_entries = shell_output_payload - elif output_text: - raw_entries = [ - { - "stdout": output_text, - "stderr": "", - "status": status, - "outcome": "success" if status == "completed" else "failure", - } - ] - - structured_output = normalize_shell_output_entries(raw_entries) if raw_entries else [] - - raw_item: dict[str, Any] = { - "type": "shell_call_output", - "call_id": shell_call.call_id, - "output": structured_output, - "status": status, - } - if max_output_length is not None: - raw_item["max_output_length"] = max_output_length - if raw_entries: - raw_item["shell_output"] = raw_entries - if provider_meta: - raw_item["provider_data"] = provider_meta + raw_entries: list[dict[str, Any]] | None = None + if shell_output_payload: + raw_entries = shell_output_payload + elif output_text: + raw_entries = [ + { + "stdout": output_text, + "stderr": "", + "status": status, + "outcome": "success" if status == "completed" else "failure", + } + ] + + structured_output = normalize_shell_output_entries(raw_entries) if raw_entries else [] + + raw_item: dict[str, Any] = { + "type": "shell_call_output", + "call_id": shell_call.call_id, + "output": structured_output, + "status": status, + } + if max_output_length is not None: + raw_item["max_output_length"] = max_output_length + if raw_entries: + raw_item["shell_output"] = raw_entries + if provider_meta: + raw_item["provider_data"] = provider_meta + + if span and config.trace_include_sensitive_data: + span.span_data.output = output_text + + return ToolCallOutputItem( + agent=agent, + output=output_text, + raw_item=raw_item, + ) - return ToolCallOutputItem( - agent=agent, - output=output_text, - raw_item=raw_item, + return await with_tool_function_span( + config=config, + tool_name=shell_tool.name, + fn=_run_call, ) @@ -385,96 +474,128 @@ async def execute( call.tool_call, context_wrapper=context_wrapper, ) - 
call_id = extract_apply_patch_call_id(call.tool_call) - needs_approval_result = await evaluate_needs_approval_setting( - apply_patch_tool.needs_approval, context_wrapper, operation, call_id - ) + async def _run_call(span: Any | None) -> RunItem: + if span and config.trace_include_sensitive_data: + span.span_data.input = _serialize_trace_payload( + { + "type": operation.type, + "path": operation.path, + "diff": operation.diff, + } + ) - if needs_approval_result: - approval_status, approval_item = await resolve_approval_status( - tool_name=apply_patch_tool.name, - call_id=call_id, - raw_item=call.tool_call, - agent=agent, - context_wrapper=context_wrapper, - on_approval=apply_patch_tool.on_approval, + needs_approval_result = await evaluate_needs_approval_setting( + apply_patch_tool.needs_approval, context_wrapper, operation, call_id ) - if approval_status is False: - rejection_message = await resolve_approval_rejection_message( - context_wrapper=context_wrapper, - run_config=config, - tool_type="apply_patch", + if needs_approval_result: + approval_status, approval_item = await resolve_approval_status( tool_name=apply_patch_tool.name, call_id=call_id, - ) - return apply_patch_rejection_item( - agent, - call_id, - rejection_message=rejection_message, + raw_item=call.tool_call, + agent=agent, + context_wrapper=context_wrapper, + on_approval=apply_patch_tool.on_approval, ) - if approval_status is not True: - return approval_item + if approval_status is False: + rejection_message = await resolve_approval_rejection_message( + context_wrapper=context_wrapper, + run_config=config, + tool_type="apply_patch", + tool_name=apply_patch_tool.name, + call_id=call_id, + ) + return apply_patch_rejection_item( + agent, + call_id, + rejection_message=rejection_message, + ) + + if approval_status is not True: + return approval_item + + await asyncio.gather( + hooks.on_tool_start(context_wrapper, agent, apply_patch_tool), + ( + agent_hooks.on_tool_start(context_wrapper, agent, apply_patch_tool) + if agent_hooks + else _coro.noop_coroutine() + ), + ) - await asyncio.gather( - hooks.on_tool_start(context_wrapper, agent, apply_patch_tool), - ( - agent_hooks.on_tool_start(context_wrapper, agent, apply_patch_tool) - if agent_hooks - else _coro.noop_coroutine() - ), - ) + status: Literal["completed", "failed"] = "completed" + output_text = "" + + try: + editor = apply_patch_tool.editor + if operation.type == "create_file": + result = editor.create_file(operation) + elif operation.type == "update_file": + result = editor.update_file(operation) + elif operation.type == "delete_file": + result = editor.delete_file(operation) + else: # pragma: no cover - validated in coerce_apply_patch_operation + raise ModelBehaviorError(f"Unsupported apply_patch operation: {operation.type}") + + awaited = await result if inspect.isawaitable(result) else result + normalized = normalize_apply_patch_result(awaited) + if normalized: + if normalized.status in {"completed", "failed"}: + status = normalized.status + if normalized.output: + output_text = normalized.output + except Exception as exc: + status = "failed" + output_text = format_shell_error(exc) + trace_error = get_trace_tool_error( + trace_include_sensitive_data=config.trace_include_sensitive_data, + error_message=output_text, + ) + if span: + span.set_error( + SpanError( + message="Error running tool", + data={ + "tool_name": apply_patch_tool.name, + "error": trace_error, + }, + ) + ) + logger.error("Apply patch editor failed: %s", exc, exc_info=True) + + await asyncio.gather( + 
hooks.on_tool_end(context_wrapper, agent, apply_patch_tool, output_text), + ( + agent_hooks.on_tool_end(context_wrapper, agent, apply_patch_tool, output_text) + if agent_hooks + else _coro.noop_coroutine() + ), + ) - status: Literal["completed", "failed"] = "completed" - output_text = "" - - try: - editor = apply_patch_tool.editor - if operation.type == "create_file": - result = editor.create_file(operation) - elif operation.type == "update_file": - result = editor.update_file(operation) - elif operation.type == "delete_file": - result = editor.delete_file(operation) - else: # pragma: no cover - validated in coerce_apply_patch_operation - raise ModelBehaviorError(f"Unsupported apply_patch operation: {operation.type}") - - awaited = await result if inspect.isawaitable(result) else result - normalized = normalize_apply_patch_result(awaited) - if normalized: - if normalized.status in {"completed", "failed"}: - status = normalized.status - if normalized.output: - output_text = normalized.output - except Exception as exc: - status = "failed" - output_text = format_shell_error(exc) - logger.error("Apply patch editor failed: %s", exc, exc_info=True) + raw_item: dict[str, Any] = { + "type": "apply_patch_call_output", + "call_id": call_id, + "status": status, + } + if output_text: + raw_item["output"] = output_text - await asyncio.gather( - hooks.on_tool_end(context_wrapper, agent, apply_patch_tool, output_text), - ( - agent_hooks.on_tool_end(context_wrapper, agent, apply_patch_tool, output_text) - if agent_hooks - else _coro.noop_coroutine() - ), - ) + if span and config.trace_include_sensitive_data: + span.span_data.output = output_text - raw_item: dict[str, Any] = { - "type": "apply_patch_call_output", - "call_id": call_id, - "status": status, - } - if output_text: - raw_item["output"] = output_text + return ToolCallOutputItem( + agent=agent, + output=output_text, + raw_item=raw_item, + ) - return ToolCallOutputItem( - agent=agent, - output=output_text, - raw_item=raw_item, + return await with_tool_function_span( + config=config, + tool_name=apply_patch_tool.name, + fn=_run_call, ) diff --git a/src/agents/run_internal/tool_execution.py b/src/agents/run_internal/tool_execution.py index a51955dee2..cdac898a3b 100644 --- a/src/agents/run_internal/tool_execution.py +++ b/src/agents/run_internal/tool_execution.py @@ -10,7 +10,7 @@ import inspect import json from collections.abc import Callable, Mapping, Sequence -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast from openai.types.responses import ResponseFunctionToolCall from openai.types.responses.response_input_item_param import ( @@ -62,7 +62,7 @@ ToolOutputGuardrailData, ToolOutputGuardrailResult, ) -from ..tracing import SpanError, function_span +from ..tracing import Span, SpanError, function_span, get_current_trace from ..util import _coro, _error_tracing from ..util._approvals import evaluate_needs_approval_setting from .approvals import append_approval_error_output @@ -104,6 +104,8 @@ "normalize_max_output_length", "normalize_shell_output_entries", "format_shell_error", + "get_trace_tool_error", + "with_tool_function_span", "build_litellm_json_tool_call", "process_hosted_mcp_approvals", "collect_manual_mcp_approvals", @@ -121,6 +123,9 @@ "execute_approved_tools", ] +REDACTED_TOOL_ERROR_MESSAGE = "Tool execution failed. Error details are redacted." 
+TToolSpanResult = TypeVar("TToolSpanResult")
+
 
 # --------------------------
 # Public helpers
@@ -531,6 +536,27 @@ def format_shell_error(error: Exception | BaseException | Any) -> str:
     return repr(error)
 
 
+def get_trace_tool_error(*, trace_include_sensitive_data: bool, error_message: str) -> str:
+    """Return a trace-safe tool error string based on the sensitive-data setting."""
+    return error_message if trace_include_sensitive_data else REDACTED_TOOL_ERROR_MESSAGE
+
+
+async def with_tool_function_span(
+    *,
+    config: RunConfig,
+    tool_name: str,
+    fn: Callable[[Span[Any] | None], Any],
+) -> TToolSpanResult:
+    """Execute a tool callback in a function span when tracing is active."""
+    if config.tracing_disabled or get_current_trace() is None:
+        result = fn(None)
+        return await result if inspect.isawaitable(result) else cast(TToolSpanResult, result)
+
+    with function_span(tool_name) as span:
+        result = fn(span)
+        return await result if inspect.isawaitable(result) else cast(TToolSpanResult, result)
+
+
 def build_litellm_json_tool_call(output: ResponseFunctionToolCall) -> FunctionTool:
     """Wrap a JSON string result in a FunctionTool so LiteLLM can stream it."""
 
diff --git a/tests/test_apply_patch_tool.py b/tests/test_apply_patch_tool.py
index 1c07ab2374..4a7e581cef 100644
--- a/tests/test_apply_patch_tool.py
+++ b/tests/test_apply_patch_tool.py
@@ -1,15 +1,25 @@
 from __future__ import annotations
 
+import json
 from dataclasses import dataclass
 from typing import Any, cast
 
 import pytest
 
-from agents import Agent, ApplyPatchTool, RunConfig, RunContextWrapper, RunHooks
+from agents import (
+    Agent,
+    ApplyPatchTool,
+    RunConfig,
+    RunContextWrapper,
+    RunHooks,
+    set_tracing_disabled,
+    trace,
+)
 from agents.editor import ApplyPatchOperation, ApplyPatchResult
 from agents.items import ToolApprovalItem, ToolCallOutputItem
 from agents.run_internal.run_loop import ApplyPatchAction, ToolRunApplyPatchCall
 
+from .testing_processor import SPAN_PROCESSOR_TESTING
 from .utils.hitl import (
     HITL_REJECTION_MSG,
     make_context_wrapper,
@@ -19,6 +29,19 @@
 )
 
 
+def _get_function_span(tool_name: str) -> dict[str, Any]:
+    for span in SPAN_PROCESSOR_TESTING.get_ordered_spans(including_empty=True):
+        exported = span.export()
+        if not exported:
+            continue
+        span_data = exported.get("span_data")
+        if not isinstance(span_data, dict):
+            continue
+        if span_data.get("type") == "function" and span_data.get("name") == tool_name:
+            return exported
+    raise AssertionError(f"Function span for tool '{tool_name}' not found")
+
+
 def _call(call_id: str, operation: dict[str, Any]) -> DummyApplyPatchCall:
     return DummyApplyPatchCall(type="apply_patch_call", call_id=call_id, operation=operation)
 
@@ -124,6 +147,71 @@ def update_file(self, operation: ApplyPatchOperation) -> ApplyPatchResult:
     assert payload_dict["status"] == "failed"
 
 
+@pytest.mark.asyncio
+async def test_apply_patch_tool_emits_function_span() -> None:
+    editor = RecordingEditor()
+    tool = ApplyPatchTool(editor=editor)
+    agent, context_wrapper, tool_run = build_apply_patch_call(
+        tool, "call_apply_trace", {"type": "update_file", "path": "tasks.md", "diff": "-a\n+b\n"}
+    )
+
+    set_tracing_disabled(False)
+    with trace("apply-patch-span-test"):
+        result = await ApplyPatchAction.execute(
+            agent=agent,
+            call=tool_run,
+            hooks=RunHooks[Any](),
+            context_wrapper=context_wrapper,
+            config=RunConfig(),
+        )
+
+    assert isinstance(result, ToolCallOutputItem)
+    function_span = _get_function_span(tool.name)
+    span_data = cast(dict[str, Any], function_span["span_data"])
+    assert "tasks.md" in cast(str, span_data.get("input", ""))
+    assert "Updated tasks.md" in cast(str, span_data.get("output", ""))
+
+
+@pytest.mark.asyncio
+async def test_apply_patch_tool_redacts_span_error_when_sensitive_data_disabled() -> None:
+    secret_error = "patch secret output"
+
+    class ExplodingEditor(RecordingEditor):
+        def update_file(self, operation: ApplyPatchOperation) -> ApplyPatchResult:
+            raise RuntimeError(secret_error)
+
+    tool = ApplyPatchTool(editor=ExplodingEditor())
+    agent, context_wrapper, tool_run = build_apply_patch_call(
+        tool,
+        "call_apply_trace_redacted",
+        {"type": "update_file", "path": "tasks.md", "diff": "-a\n+b\n"},
+    )
+
+    set_tracing_disabled(False)
+    with trace("apply-patch-span-redaction-test"):
+        result = await ApplyPatchAction.execute(
+            agent=agent,
+            call=tool_run,
+            hooks=RunHooks[Any](),
+            context_wrapper=context_wrapper,
+            config=RunConfig(trace_include_sensitive_data=False),
+        )
+
+    assert isinstance(result, ToolCallOutputItem)
+    function_span = _get_function_span(tool.name)
+    assert function_span.get("error") == {
+        "message": "Error running tool",
+        "data": {
+            "tool_name": tool.name,
+            "error": "Tool execution failed. Error details are redacted.",
+        },
+    }
+    assert secret_error not in json.dumps(function_span)
+    span_data = cast(dict[str, Any], function_span["span_data"])
+    assert span_data.get("input") is None
+    assert span_data.get("output") is None
+
+
 @pytest.mark.asyncio
 async def test_apply_patch_tool_accepts_mapping_call() -> None:
     editor = RecordingEditor()
diff --git a/tests/test_computer_action.py b/tests/test_computer_action.py
index fda553626b..e79a2d27ed 100644
--- a/tests/test_computer_action.py
+++ b/tests/test_computer_action.py
@@ -4,6 +4,7 @@
 that screenshots are taken and wrapped appropriately, and that the execute
 function invokes hooks and returns the expected ToolCallOutputItem."""
 
+import json
 from typing import Any, cast
 
 import pytest
@@ -31,12 +32,29 @@
     RunConfig,
     RunContextWrapper,
     RunHooks,
+    set_tracing_disabled,
+    trace,
 )
 from agents.items import ToolCallOutputItem
 from agents.run_internal import run_loop
 from agents.run_internal.run_loop import ComputerAction, ToolRunComputerAction
 from agents.tool import ComputerToolSafetyCheckData
 
+from .testing_processor import SPAN_PROCESSOR_TESTING
+
+
+def _get_function_span(tool_name: str) -> dict[str, Any]:
+    for span in SPAN_PROCESSOR_TESTING.get_ordered_spans(including_empty=True):
+        exported = span.export()
+        if not exported:
+            continue
+        span_data = exported.get("span_data")
+        if not isinstance(span_data, dict):
+            continue
+        if span_data.get("type") == "function" and span_data.get("name") == tool_name:
+            return exported
+    raise AssertionError(f"Function span for tool '{tool_name}' not found")
+
 
 class LoggingComputer(Computer):
     """A `Computer` implementation that logs calls to its methods for verification in tests."""
@@ -313,6 +331,85 @@ async def test_execute_invokes_hooks_and_returns_tool_call_output() -> None:
     assert raw["output"]["image_url"].endswith("xyz")
 
 
+@pytest.mark.asyncio
+async def test_execute_emits_function_span() -> None:
+    computer = LoggingComputer(screenshot_return="trace_img")
+    comptool = ComputerTool(computer=computer)
+    tool_call = ResponseComputerToolCall(
+        id="tool_trace",
+        type="computer_call",
+        action=ActionScreenshot(type="screenshot"),
+        call_id="tool_trace",
+        pending_safety_checks=[],
+        status="completed",
+    )
+    tool_run = ToolRunComputerAction(tool_call=tool_call, computer_tool=comptool)
+    agent = Agent(name="test_agent_trace", tools=[comptool])
+
+    set_tracing_disabled(False)
+    with trace("computer-span-test"):
+        result = await ComputerAction.execute(
+            agent=agent,
+            action=tool_run,
+            hooks=RunHooks[Any](),
+            context_wrapper=RunContextWrapper(context=None),
+            config=RunConfig(),
+        )
+
+    assert isinstance(result, ToolCallOutputItem)
+    function_span = _get_function_span(comptool.name)
+    span_data = cast(dict[str, Any], function_span["span_data"])
+    assert span_data.get("input") is not None
+    assert cast(str, span_data.get("output", "")).startswith("data:image/png;base64,")
+
+
+@pytest.mark.asyncio
+async def test_execute_redacts_span_error_when_sensitive_data_disabled() -> None:
+    secret_error = "computer secret output"
+
+    class FailingComputer(LoggingComputer):
+        def screenshot(self) -> str:
+            raise RuntimeError(secret_error)
+
+    computer = FailingComputer()
+    comptool = ComputerTool(computer=computer)
+    tool_call = ResponseComputerToolCall(
+        id="tool_trace_error",
+        type="computer_call",
+        action=ActionScreenshot(type="screenshot"),
+        call_id="tool_trace_error",
+        pending_safety_checks=[],
+        status="completed",
+    )
+    tool_run = ToolRunComputerAction(tool_call=tool_call, computer_tool=comptool)
+    agent = Agent(name="test_agent_trace_error", tools=[comptool])
+
+    set_tracing_disabled(False)
+    with trace("computer-span-redaction-test"):
+        result = await ComputerAction.execute(
+            agent=agent,
+            action=tool_run,
+            hooks=RunHooks[Any](),
+            context_wrapper=RunContextWrapper(context=None),
+            config=RunConfig(trace_include_sensitive_data=False),
+        )
+
+    assert isinstance(result, ToolCallOutputItem)
+    assert result.output == ""
+    function_span = _get_function_span(comptool.name)
+    assert function_span.get("error") == {
+        "message": "Error running tool",
+        "data": {
+            "tool_name": comptool.name,
+            "error": "Tool execution failed. Error details are redacted.",
+        },
+    }
+    assert secret_error not in json.dumps(function_span)
+    span_data = cast(dict[str, Any], function_span["span_data"])
+    assert span_data.get("input") is None
+    assert span_data.get("output") is None
+
+
 @pytest.mark.asyncio
 async def test_pending_safety_check_acknowledged() -> None:
     """Safety checks should be acknowledged via the callback."""
diff --git a/tests/test_shell_tool.py b/tests/test_shell_tool.py
index 2ec532f049..b513388d37 100644
--- a/tests/test_shell_tool.py
+++ b/tests/test_shell_tool.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from typing import Any, cast
 
 import pytest
@@ -14,10 +15,13 @@
     ShellResult,
     ShellTool,
     UserError,
+    set_tracing_disabled,
+    trace,
 )
 from agents.items import ToolApprovalItem, ToolCallOutputItem
 from agents.run_internal.run_loop import ShellAction, ToolRunShellCall, execute_shell_calls
 
+from .testing_processor import SPAN_PROCESSOR_TESTING
 from .utils.hitl import (
     HITL_REJECTION_MSG,
     make_context_wrapper,
@@ -29,6 +33,19 @@
 )
 
 
+def _get_function_span(tool_name: str) -> dict[str, Any]:
+    for span in SPAN_PROCESSOR_TESTING.get_ordered_spans(including_empty=True):
+        exported = span.export()
+        if not exported:
+            continue
+        span_data = exported.get("span_data")
+        if not isinstance(span_data, dict):
+            continue
+        if span_data.get("type") == "function" and span_data.get("name") == tool_name:
+            return exported
+    raise AssertionError(f"Function span for tool '{tool_name}' not found")
+
+
 def _shell_call(call_id: str = "call_shell") -> dict[str, Any]:
     return cast(
         dict[str, Any],
@@ -268,6 +285,71 @@ async def test_shell_tool_structured_output_is_rendered() -> None:
     assert "provider_data" not in payload_dict
 
 
+@pytest.mark.asyncio
+async def test_shell_tool_emits_function_span() -> None:
+    shell_tool = ShellTool(executor=lambda request: "shell span output")
+    tool_run = ToolRunShellCall(tool_call=_shell_call("call_shell_trace"), shell_tool=shell_tool)
+    agent = Agent(name="shell-agent", tools=[shell_tool])
+    context_wrapper: RunContextWrapper[Any] = RunContextWrapper(context=None)
+
+    set_tracing_disabled(False)
+    with trace("shell-span-test"):
+        result = await ShellAction.execute(
+            agent=agent,
+            call=tool_run,
+            hooks=RunHooks[Any](),
+            context_wrapper=context_wrapper,
+            config=RunConfig(),
+        )
+
+    assert isinstance(result, ToolCallOutputItem)
+    function_span = _get_function_span(shell_tool.name)
+    span_data = cast(dict[str, Any], function_span["span_data"])
+    assert "echo hi" in cast(str, span_data.get("input", ""))
+    assert span_data.get("output") == "shell span output"
+
+
+@pytest.mark.asyncio
+async def test_shell_tool_redacts_span_error_when_sensitive_data_disabled() -> None:
+    secret_error = "shell secret output"
+
+    class ExplodingExecutor:
+        def __call__(self, request):
+            raise RuntimeError(secret_error)
+
+    shell_tool = ShellTool(executor=ExplodingExecutor())
+    tool_run = ToolRunShellCall(
+        tool_call=_shell_call("call_shell_trace_redacted"),
+        shell_tool=shell_tool,
+    )
+    agent = Agent(name="shell-agent", tools=[shell_tool])
+    context_wrapper: RunContextWrapper[Any] = RunContextWrapper(context=None)
+
+    set_tracing_disabled(False)
+    with trace("shell-span-redaction-test"):
+        result = await ShellAction.execute(
+            agent=agent,
+            call=tool_run,
+            hooks=RunHooks[Any](),
+            context_wrapper=context_wrapper,
+            config=RunConfig(trace_include_sensitive_data=False),
+        )
+
+    assert isinstance(result, ToolCallOutputItem)
+    function_span = _get_function_span(shell_tool.name)
+    assert function_span.get("error") == {
+        "message": "Error running tool",
+        "data": {
+            "tool_name": shell_tool.name,
+            "error": "Tool execution failed. Error details are redacted.",
+        },
+    }
+    assert secret_error not in json.dumps(function_span)
+    span_data = cast(dict[str, Any], function_span["span_data"])
+    assert span_data.get("input") is None
+    assert span_data.get("output") is None
+
+
 @pytest.mark.asyncio
 async def test_shell_tool_executor_failure_returns_error() -> None:
     class ExplodingExecutor:

From 4be943580b2060ae8c338a2c48f1e143485131f7 Mon Sep 17 00:00:00 2001
From: Kazuhiro Sera
Date: Tue, 17 Feb 2026 09:15:14 +0900
Subject: [PATCH 2/2] fix review comment

---
 src/agents/run_internal/tool_actions.py |  2 +-
 tests/test_computer_action.py           | 17 ++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/agents/run_internal/tool_actions.py b/src/agents/run_internal/tool_actions.py
index 010518dd7d..41607913fc 100644
--- a/src/agents/run_internal/tool_actions.py
+++ b/src/agents/run_internal/tool_actions.py
@@ -129,7 +129,6 @@ async def _run_action(span: Any | None) -> RunItem:
                 ),
             )
 
-            output = ""
             try:
                 output = await cls._execute_action_and_capture(computer, action.tool_call)
             except Exception as exc:
@@ -149,6 +148,7 @@ async def _run_action(span: Any | None) -> RunItem:
                     )
                 )
                 logger.error("Failed to execute computer action: %s", exc, exc_info=True)
+                raise
 
             await asyncio.gather(
                 hooks.on_tool_end(context_wrapper, agent, action.computer_tool, output),
diff --git a/tests/test_computer_action.py b/tests/test_computer_action.py
index e79a2d27ed..75d3448d60 100644
--- a/tests/test_computer_action.py
+++ b/tests/test_computer_action.py
@@ -386,16 +386,15 @@ def screenshot(self) -> str:
 
     set_tracing_disabled(False)
     with trace("computer-span-redaction-test"):
-        result = await ComputerAction.execute(
-            agent=agent,
-            action=tool_run,
-            hooks=RunHooks[Any](),
-            context_wrapper=RunContextWrapper(context=None),
-            config=RunConfig(trace_include_sensitive_data=False),
-        )
+        with pytest.raises(RuntimeError, match=secret_error):
+            await ComputerAction.execute(
+                agent=agent,
+                action=tool_run,
+                hooks=RunHooks[Any](),
+                context_wrapper=RunContextWrapper(context=None),
+                config=RunConfig(trace_include_sensitive_data=False),
+            )
 
-        assert isinstance(result, ToolCallOutputItem)
-        assert result.output == ""
         function_span = _get_function_span(comptool.name)
         assert function_span.get("error") == {
             "message": "Error running tool",