Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,8 @@ def _openai_chat_message_parser(
args["content"].append(self._openai_content_parser(message.role, content, call_id_to_id)) # type: ignore
if "content" in args or "tool_calls" in args:
all_messages.append(args)
elif message.raw_representation:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is a good idea, there is a reason we have not created abstractions for computer use, and it's because the variety and complexity of the code needed to handle the input and outputs of it across platforms is too complex for our purposes. Adding a raw_representation as a input goes against all that we do and I think if a dev needs this kind of special behavior then they are probably better off building directly against an SDK anyway since it is not abstracted, so it's not like they will be able to swap in and out between models and therefore the added value is low, and putting this method in, might break some other things, and putting this sample in implies we support this scenario, while we really don't...

all_messages.append(message.raw_representation)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would avoid using raw_representation as input. As far as I know, we currently use this property as output only, unless I'm missing something. Instead of using raw_representation as input, we can:

  • Allow to pass dict as part of ChatMessage.contents, which will enable breaking-glass scenario for message content types as input.
  • Add new content type for computer use tool. I think this would be a preferred approach, since this tool type exists in both OpenAI Responses API and Azure AI.

Copy link
Member

@eavanvalkenburg eavanvalkenburg Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that was my comment as well (didn't see this before), and we had an ADR PR discussing the potential of a set of computer use types and decided against it: #796 (comment)

return all_messages

def _openai_content_parser(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,15 @@ async def test_prepare_options_store_parameter_handling() -> None:
assert "previous_response_id" not in options


async def test_prepare_options_message_raw_representation_handling() -> None:
    """Test that a ChatMessage with empty contents falls back to passing its raw_representation as input."""
    client = OpenAIResponsesClient(model_id="test-model", api_key="test-key")
    # Empty contents plus a raw_representation dict: the parser should forward the dict as-is.
    messages = [ChatMessage(role="user", contents=[], raw_representation={"some": "data"})]

    chat_options = ChatOptions()
    options = await client.prepare_options(messages, chat_options)
    assert options["input"] == [{"some": "data"}]


def test_openai_responses_client_with_callable_api_key() -> None:
"""Test OpenAIResponsesClient initialization with callable API key."""

Expand Down
1 change: 1 addition & 0 deletions python/samples/getting_started/agents/azure_ai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ This folder contains examples demonstrating different ways to create and use age
| [`azure_ai_with_thread.py`](azure_ai_with_thread.py) | Demonstrates thread management with Azure AI agents, including automatic thread creation for stateless conversations and explicit thread management for maintaining conversation context across multiple interactions. |
| [`azure_ai_with_image_generation.py`](azure_ai_with_image_generation.py) | Shows how to use the `ImageGenTool` with Azure AI agents to generate images based on text prompts. |
| [`azure_ai_with_web_search.py`](azure_ai_with_web_search.py) | Shows how to use the `HostedWebSearchTool` with Azure AI agents to perform web searches and retrieve up-to-date information from the internet. |
| [`azure_ai_with_computer_use.py`](azure_ai_with_computer_use.py) | Shows how to use the `ComputerUsePreviewTool` with Azure AI agents to perform actions on a simulated computer, which uses existing screenshots. |

## Environment Variables

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import base64
import os
from enum import Enum

from agent_framework import ChatMessage, ChatResponse, DataContent, Role, TextContent
from agent_framework.azure import AzureAIClient
from azure.ai.projects.models import ComputerUsePreviewTool
from azure.identity.aio import AzureCliCredential
from openai.types.responses import ResponseComputerToolCall
from openai.types.responses.response import Response
from openai.types.responses.response_computer_tool_call import Action

"""
Azure AI Agent With Computer Use Tool

This sample demonstrates basic usage of AzureAIClient to create an agent
that can perform computer automation tasks using the ComputerUsePreviewTool.

Pre-requisites:
- Make sure to set up the AZURE_AI_PROJECT_ENDPOINT.
- Make sure to deploy a model that supports the computer use tool, currently "computer-use-preview".

Note that the computer operations in this sample are simulated for demonstration purposes.
"""


class SearchState(Enum):
    """Enum for tracking the state of the simulated web search workflow."""

    INITIAL = "initial"  # Browser search page
    TYPED = "typed"  # Text entered in search box
    PRESSED_ENTER = "pressed_enter"  # Enter key pressed, transitioning to results


def image_to_base64(image_path: str) -> str:
    """Read an image file and return its contents as a Base64-encoded string.

    Args:
        image_path: The path to the image file (e.g. 'image_file.png')

    Returns:
        A Base64-encoded string representing the image.

    Raises:
        FileNotFoundError: If the provided file path does not exist.
        OSError: If there's an error reading the file.
    """
    # Fail fast with a clear message before attempting to open the file.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"File not found at: {image_path}")

    try:
        with open(image_path, "rb") as handle:
            encoded = base64.b64encode(handle.read())
    except Exception as exc:
        # Wrap any low-level read failure in a uniform OSError for callers.
        raise OSError(f"Error reading file '{image_path}'") from exc
    return encoded.decode("utf-8")


def load_screenshot_assets() -> dict[str, dict[str, str]]:
    """Load and convert screenshot images to base64 data URLs.

    Returns:
        dict: Dictionary mapping state names to screenshot info dicts with keys
            "filename" (base name of the asset file) and "url" (a
            "data:image/png;base64,..." data URL).

    Raises:
        FileNotFoundError: If any required screenshot asset files are missing
    """
    # Load demo screenshot images from assets directory
    # Flow: search page -> typed search -> search results
    screenshot_paths = {
        "browser_search": os.path.abspath(os.path.join(os.path.dirname(__file__), "./assets/cua_browser_search.png")),
        "search_typed": os.path.abspath(os.path.join(os.path.dirname(__file__), "./assets/cua_search_typed.png")),
        "search_results": os.path.abspath(os.path.join(os.path.dirname(__file__), "./assets/cua_search_results.png")),
    }

    # Convert images to base64 data URLs. The filename is derived from the path
    # itself, so no separate filename mapping needs to be maintained.
    screenshots: dict[str, dict[str, str]] = {}
    for key, path in screenshot_paths.items():
        try:
            image_base64 = image_to_base64(path)
            screenshots[key] = {"filename": os.path.basename(path), "url": f"data:image/png;base64,{image_base64}"}
        except FileNotFoundError as e:
            print(f"Error: Missing required screenshot asset: {e}")
            raise

    return screenshots


def handle_computer_action_and_take_screenshot(
    action: Action,
    current_state: SearchState,
    screenshots: dict[str, dict[str, str]],
):
    """Simulate execution of a computer action and select the matching screenshot.

    In a real implementation, you might want to execute real browser operations
    instead of just printing, take screenshots, and return actual screenshot data.

    Args:
        action: The computer action to process (click, type, key press, etc.)
        current_state: Current SearchState of the simulation
        screenshots: Dictionary of screenshot data

    Returns:
        tuple: (screenshot_info, updated_current_state)
    """
    print(f"Executing computer action: {action.type}")

    # Advance the simulated workflow state; at most one transition per action.
    if action.type == "type" and hasattr(action, "text") and action.text:
        current_state = SearchState.TYPED
        print(f" Typing text: '{action.text}' - Simulating keyboard input")
    elif action.type == "keypress" and action.keys and ("Return" in action.keys or "ENTER" in action.keys):
        # ENTER submits the search.
        current_state = SearchState.PRESSED_ENTER
        print(" -> Detected ENTER key press")
    elif action.type == "click" and current_state == SearchState.TYPED:
        # A click after typing is treated as an alternative submit.
        current_state = SearchState.PRESSED_ENTER
        print(" -> Detected click after typing")

    # Realistic feedback for pointer-style actions that carry coordinates.
    if hasattr(action, "x") and hasattr(action, "y"):
        if action.type == "click":
            print(f" Click at ({action.x}, {action.y}) - Simulating click on UI element")
        elif action.type == "drag":
            path_str = " -> ".join([f"({p.x}, {p.y})" for p in action.path])
            print(f" Drag path: {path_str} - Simulating drag operation")
        elif action.type == "scroll":
            print(f" Scroll at ({action.x}, {action.y}) - Simulating scroll action")

    if action.type == "keypress" and action.keys:
        print(f" Key press: {action.keys} - Simulating key combination")

    if action.type == "screenshot":
        print(" Taking screenshot - Capturing current screen state")

    print(f" -> Action processed: {action.type}")

    # Map the (possibly updated) state to the screenshot that represents it;
    # fall back to the initial browser page for any unexpected state.
    state_to_key = {
        SearchState.PRESSED_ENTER: "search_results",
        SearchState.TYPED: "search_typed",
        SearchState.INITIAL: "browser_search",
    }
    screenshot_info = screenshots[state_to_key.get(current_state, "browser_search")]

    return screenshot_info, current_state


def print_final_output(openai_response: "Response") -> None:
    """Print the final output when the agent completes the task.

    Args:
        openai_response: The inner response object containing the agent's final output
    """
    print("No computer calls found. Agent completed the task:")
    final_output = ""
    for item in openai_response.output:
        if item.type == "message":
            contents = item.content
            for part in contents:
                # Parenthesize the fallback chain so the newline is appended to the
                # selected text. Previously `... or "" + "\n"` bound "\n" only to the
                # empty-string fallback, so non-empty parts ran together on one line.
                final_output += (getattr(part, "text", None) or getattr(part, "refusal", None) or "") + "\n"

    print(f"Final status: {openai_response.status}")
    print(f"Final output: {final_output.strip()}")


async def main():
    """Main async function to demonstrate Computer Use Agent functionality."""

    async with (
        AzureCliCredential() as credential,
        AzureAIClient(
            async_credential=credential,
            model_deployment_name="computer-use-preview",  # Computer use tool requires specific deployment
        ).create_agent(
            name="ComputerUseAgent",
            instructions="You are a computer automation assistant.",
            tools=[ComputerUsePreviewTool(display_width=1026, display_height=769, environment="windows")],
        ) as agent,
    ):
        print("Starting computer automation session (initial screenshot: cua_browser_search.png)...")

        # Initialize state machine
        current_state = SearchState.INITIAL

        # Load screenshot assets
        try:
            screenshots = load_screenshot_assets()
            print("Successfully loaded screenshot assets")
        except FileNotFoundError:
            # Bug fix: assets are loaded from ./assets/ relative to this script
            # (see load_screenshot_assets), not ../assets/ as previously stated.
            print("Failed to load required screenshot assets. Please ensure the asset files exist in ./assets/")
            return

        # Initial user message to start the search task
        messages: list[ChatMessage] = [
            ChatMessage(
                role="user",
                contents=[
                    TextContent(
                        "I need you to help me search for 'OpenAI news'. "
                        "Please type 'OpenAI news' and submit the search. "
                        "Once you see search results, the task is complete."
                    ),
                    DataContent(uri=screenshots["browser_search"]["url"], media_type="image/png"),
                ],
            )
        ]

        agent_thread = agent.get_new_thread()
        response = await agent.run(messages, thread=agent_thread, additional_chat_options={"truncation": "auto"})
        print(f"Initial response received (ID: {response.response_id})")

        # Main interaction loop with deterministic completion
        max_iterations = 10  # Allow enough iterations for completion
        iteration = 0

        while True:
            if iteration >= max_iterations:
                print(f"\nReached maximum iterations ({max_iterations}). Stopping.")
                break

            iteration += 1
            print(f"\n--- Iteration {iteration} ---")

            # Unwrap the layered raw representations down to the OpenAI Response object.
            assert response.raw_representation is not None
            assert isinstance(response.raw_representation, ChatResponse)

            chat_response = response.raw_representation
            assert chat_response.raw_representation is not None
            assert isinstance(chat_response.raw_representation, Response)

            openai_response = chat_response.raw_representation
            computer_calls = [
                computer_call
                for computer_call in openai_response.output
                if isinstance(computer_call, ResponseComputerToolCall)
            ]

            # No pending computer calls means the agent considers the task done.
            if not computer_calls:
                print_final_output(openai_response)
                break

            # Only process the first computer call
            computer_call = computer_calls[0]
            action = computer_call.action
            call_id = computer_call.call_id
            print(f"Processing computer call ID: {call_id}, Action type: {action.type}")

            # Handle the action and get the screenshot info
            screenshot_info, current_state = handle_computer_action_and_take_screenshot(
                action, current_state, screenshots
            )

            # Return the simulated screenshot to the service as a tool output,
            # passed through raw_representation since no abstraction exists yet.
            print(f"Sending action result back to agent (using {screenshot_info['filename']})...")
            response = await agent.run(
                ChatMessage(
                    role=Role.TOOL,
                    contents=[],
                    raw_representation={
                        "call_id": call_id,
                        "type": "computer_call_output",
                        "output": {
                            "type": "computer_screenshot",
                            "image_url": screenshot_info["url"],
                        },
                    },
                ),
                thread=agent_thread,
                additional_chat_options={"truncation": "auto"},
            )

            print(f"Follow-up response received (ID: {response.response_id})")


# Script entry point: run the async demo.
if __name__ == "__main__":
    asyncio.run(main())