Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 71 additions & 11 deletions packages/core/lib/v3/agent/AnthropicCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ export class AnthropicCUAClient extends AgentClient {
private currentViewport = { width: 1288, height: 711 };
private currentUrl?: string;
private screenshotProvider?: () => Promise<string>;
private zoomedScreenshotProvider?: (region: number[]) => Promise<string>;
private actionHandler?: (action: AgentAction) => Promise<void>;
private thinkingBudget: number | null = null;
private tools?: ToolSet;
Expand Down Expand Up @@ -95,6 +96,10 @@ export class AnthropicCUAClient extends AgentClient {
this.screenshotProvider = provider;
}

/**
 * Registers the callback used to capture a screenshot of a sub-region of the screen.
 *
 * The callback is stored in `zoomedScreenshotProvider` and replaces any
 * previously registered one. `captureZoomedScreenshot()` invokes it with the
 * requested region and accepts either a raw base64 string or a `data:` URL
 * as the resolved value.
 *
 * @param provider - Async function that receives the region array and resolves to the image string.
 */
setZoomedScreenshotProvider(provider: (region: number[]) => Promise<string>): void {
this.zoomedScreenshotProvider = provider;
}

/**
 * Registers the callback that handles agent actions.
 *
 * The callback is stored in `actionHandler` and replaces any previously
 * registered one.
 *
 * @param handler - Async function that receives an `AgentAction`.
 */
setActionHandler(handler: (action: AgentAction) => Promise<void>): void {
this.actionHandler = handler;
}
Expand Down Expand Up @@ -454,20 +459,27 @@ export class AnthropicCUAClient extends AgentClient {
? "computer-use-2025-11-24"
: "computer-use-2025-01-24";

// Create the computer tool definition
// For models using computer_20251124, enable the zoom capability
const computerToolDef: Record<string, unknown> = {
type: computerToolType,
name: "computer",
display_width_px: this.currentViewport.width,
display_height_px: this.currentViewport.height,
display_number: 1,
};

// Enable zoom for models that support it (computer_20251124)
if (shouldUseNewToolVersion) {
computerToolDef.enable_zoom = true;
}

// Create the request parameters
const requestParams: Record<string, unknown> = {
model: this.modelName,
max_tokens: 4096,
messages: messages,
tools: [
{
type: computerToolType,
name: "computer",
display_width_px: this.currentViewport.width,
display_height_px: this.currentViewport.height,
display_number: 1,
},
],
tools: [computerToolDef],
betas: [betaFlag],
};

Expand Down Expand Up @@ -589,8 +601,20 @@ export class AnthropicCUAClient extends AgentClient {
level: 2,
});

// Capture a screenshot for the response
const screenshot = await this.captureScreenshot();
// For zoom action, capture a cropped screenshot of the specified region
// For other actions, capture a full screenshot
let screenshot: string;
if (action === "zoom" && item.input.region) {
const region = item.input.region as number[];
logger({
category: "agent",
message: `Zoom action requested for region: [${region.join(", ")}]`,
level: 2,
});
screenshot = await this.captureZoomedScreenshot(region);
} else {
screenshot = await this.captureScreenshot();
}
logger({
category: "agent",
message: `Screenshot captured, length: ${screenshot.length}`,
Expand Down Expand Up @@ -916,6 +940,14 @@ export class AnthropicCUAClient extends AgentClient {
type: "wait",
...input,
};
} else if (action === "zoom") {
// Handle zoom action - returns a cropped region at full resolution
const region = input.region as number[] | undefined;
return {
type: "zoom",
region: region,
...input,
};
} else if (action === "left_click") {
// Convert left_click to regular click
const coordinates = input.coordinate as number[] | undefined;
Expand Down Expand Up @@ -979,4 +1011,32 @@ export class AnthropicCUAClient extends AgentClient {
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
);
}

/**
 * Capture a screenshot of a specific region of the screen.
 * The region is defined by [x1, y1, x2, y2] coordinates (top-left and bottom-right corners).
 *
 * When a zoomed screenshot provider has been registered, the region is
 * validated and forwarded to it. When no provider is registered, this falls
 * back to `captureScreenshot()`, so the returned image is NOT cropped and the
 * caller is expected to handle the zoom/crop itself.
 *
 * @param region - Array of [x1, y1, x2, y2] coordinates defining the region to capture
 * @returns A data URL with the base64-encoded image (cropped only when a zoomed screenshot provider is set)
 * @throws Error when a provider is set and `region` is not exactly four finite
 *   numbers with x2 > x1 and y2 > y1. Errors thrown by the provider are logged and rethrown.
 */
async captureZoomedScreenshot(region: number[]): Promise<string> {
  if (!this.zoomedScreenshotProvider) {
    // Fall back to regular screenshot if no zoomed screenshot provider is set
    return this.captureScreenshot();
  }

  // The region originates from model output that is only cast, never checked,
  // so reject malformed values here with a clear message instead of letting
  // NaN or negative dimensions reach the underlying screenshot call.
  if (!Array.isArray(region) || region.length !== 4) {
    throw new Error(
      `Invalid zoom region: expected [x1, y1, x2, y2], received ${JSON.stringify(region)}`,
    );
  }
  // NaN defaults keep the comparison below type-safe even when the compiler
  // treats indexed access as possibly undefined.
  const [x1 = NaN, y1 = NaN, x2 = NaN, y2 = NaN] = region;
  const allFinite = [x1, y1, x2, y2].every((value) => Number.isFinite(value));
  if (!allFinite || x2 <= x1 || y2 <= y1) {
    throw new Error(
      `Invalid zoom region: expected finite coordinates with x2 > x1 and y2 > y1, received ${JSON.stringify(region)}`,
    );
  }

  try {
    const base64Image = await this.zoomedScreenshotProvider(region);
    // Handle both raw base64 and data URLs
    return base64Image.startsWith("data:")
      ? base64Image
      : `data:image/png;base64,${base64Image}`;
  } catch (error) {
    console.error("Error capturing zoomed screenshot:", error);
    throw error;
  }
}
}
26 changes: 26 additions & 0 deletions packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { V3 } from "../v3.js";
import { ToolSet } from "ai";
import { AgentClient } from "../agent/AgentClient.js";
import { AgentProvider } from "../agent/AgentProvider.js";
import { AnthropicCUAClient } from "../agent/AnthropicCUAClient.js";
import { GoogleCUAClient } from "../agent/GoogleCUAClient.js";
import { OpenAICUAClient } from "../agent/OpenAICUAClient.js";
import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js";
Expand Down Expand Up @@ -79,6 +80,26 @@ export class V3CuaAgentHandler {
return screenshotBuffer.toString("base64"); // base64 png
});

// Anthropic's CUA zoom tool needs an image of just the requested rectangle,
// so for Anthropic clients register a provider that screenshots only that
// region of the active page via the screenshot `clip` option.
if (this.agentClient instanceof AnthropicCUAClient) {
  const captureRegion = async (region: number[]) => {
    this.ensureNotClosed();
    const activePage = await this.v3.context.awaitActivePage();

    // region is [x1, y1, x2, y2]: top-left and bottom-right corners
    const [left, top, right, bottom] = region;
    const clip = {
      x: left,
      y: top,
      width: right - left,
      height: bottom - top,
    };

    const regionShot = await activePage.screenshot({ fullPage: false, clip });
    return regionShot.toString("base64");
  };

  this.agentClient.setZoomedScreenshotProvider(captureRegion);
}

// Provide action executor
this.agentClient.setActionHandler(async (action) => {
this.ensureNotClosed();
Expand Down Expand Up @@ -506,6 +527,11 @@ export class V3CuaAgentHandler {
// No-op - screenshot is captured by captureAndSendScreenshot() after all actions
return { success: true };
}
case "zoom": {
// No-op here - the zoomed screenshot is captured by the AnthropicCUAClient's
// takeAction() method via captureZoomedScreenshot(), not via the action handler.
return { success: true };
}
case "goto": {
const { url } = action;
await page.goto(String(url ?? ""), { waitUntil: "load" });
Expand Down
221 changes: 221 additions & 0 deletions packages/core/tests/unit/anthropic-cua-client-zoom.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
import { describe, expect, it, vi, beforeEach } from "vitest";
import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js";

/**
 * Builds an AnthropicCUAClient for the given model name using fixed test
 * constructor arguments (including a dummy API key).
 */
function createClient(modelName = "claude-sonnet-4-6") {
  const clientOptions = { apiKey: "test-key" };
  return new AnthropicCUAClient(
    "anthropic",
    modelName,
    undefined,
    clientOptions,
  );
}

describe("AnthropicCUAClient zoom tool", () => {
describe("tool definition", () => {
  type ComputerTool = { type: string; name: string; enable_zoom?: boolean };
  type GetAction = (inputItems: unknown[]) => Promise<{
    content: unknown[];
    id: string;
    usage: Record<string, number>;
  }>;

  /**
   * Sends one `getAction` request for the given model through a stubbed
   * Anthropic SDK client and returns the "computer" tool definition found in
   * the captured request parameters.
   *
   * The captured params live on a holder object rather than a reassigned
   * `let`: a `let` initialised to `null` and only written inside the mock
   * callback is narrowed to `null` by control-flow analysis, which makes a
   * later `captured!.tools` resolve against `never` under tsc.
   */
  async function requestComputerTool(
    modelName: string,
  ): Promise<ComputerTool | undefined> {
    const client = createClient(modelName);

    // Mock the Anthropic client to capture the request params
    const captured: { params?: Record<string, unknown> } = {};
    const create = vi
      .fn()
      .mockImplementation((params: Record<string, unknown>) => {
        captured.params = params;
        return Promise.resolve({
          id: "test-id",
          content: [{ type: "text", text: "test response" }],
          usage: { input_tokens: 10, output_tokens: 5 },
        });
      });

    // Replace the client's internal Anthropic client
    (
      client as unknown as {
        client: { beta: { messages: { create: typeof create } } };
      }
    ).client = { beta: { messages: { create } } };

    // Access the private method through type casting
    const getAction = (
      client as unknown as { getAction: GetAction }
    ).getAction.bind(client);
    await getAction([{ role: "user", content: "test" }]);

    // Fail loudly if the stubbed SDK client was never invoked
    if (captured.params === undefined) {
      throw new Error("Expected getAction to call the Anthropic client");
    }
    const tools = captured.params.tools as ComputerTool[];
    return tools.find((tool) => tool.name === "computer");
  }

  it("includes enable_zoom: true for models using computer_20251124", async () => {
    const computerTool = await requestComputerTool("claude-sonnet-4-6");

    expect(computerTool).toBeDefined();
    expect(computerTool?.type).toBe("computer_20251124");
    expect(computerTool?.enable_zoom).toBe(true);
  });

  it("does NOT include enable_zoom for models using computer_20250124", async () => {
    // Use a model that requires the older tool version
    const computerTool = await requestComputerTool("claude-sonnet-4-20250514");

    expect(computerTool).toBeDefined();
    expect(computerTool?.type).toBe("computer_20250124");
    expect(computerTool?.enable_zoom).toBeUndefined();
  });
});

describe("convertToolUseToAction", () => {
  it("converts zoom tool use to a zoom action with region", () => {
    type ToolUse = { name: string; input: Record<string, unknown> };
    type ConvertedAction = { type: string; region?: number[] } | null;

    const client = createClient();

    // convertToolUseToAction is private, so reach it through a structural cast.
    // Calling it as a method on the cast object keeps `this` bound to the client.
    const internals = client as unknown as {
      convertToolUseToAction: (item: ToolUse) => ConvertedAction;
    };

    const action = internals.convertToolUseToAction({
      name: "computer",
      input: {
        action: "zoom",
        region: [100, 200, 400, 350],
      },
    });

    expect(action).not.toBeNull();
    expect(action?.type).toBe("zoom");
    expect(action?.region).toEqual([100, 200, 400, 350]);
  });
});

describe("takeAction with zoom", () => {
  type ToolUseItem = {
    id: string;
    name: string;
    input: Record<string, unknown>;
  };
  // `content` items are typed with a `type` field so the `find` predicates
  // below type-check; a `{ type: string }` predicate over `unknown[]` is
  // rejected under strictFunctionTypes.
  type ToolResult = {
    type: string;
    tool_use_id: string;
    content: Array<{ type: string }>;
  };
  type TakeAction = (
    toolUseItems: ToolUseItem[],
    logger: (msg: { category: string; message: string; level: number }) => void,
  ) => Promise<ToolResult[]>;

  // Base64 image payload returned by the mocked screenshot provider
  const mockScreenshot =
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";

  /** Builds a fresh zoom tool-use item so tests never share a region array. */
  function zoomToolUse(): ToolUseItem {
    return {
      id: "tool-use-1",
      name: "computer",
      input: {
        action: "zoom",
        region: [100, 200, 400, 350],
      },
    };
  }

  /** Exposes the private `takeAction` method, bound to the given client. */
  function takeActionOf(client: AnthropicCUAClient): TakeAction {
    return (
      client as unknown as { takeAction: TakeAction }
    ).takeAction.bind(client);
  }

  it("captures a cropped screenshot for the specified region", async () => {
    const client = createClient();

    // Mock screenshot provider to return a full screenshot
    client.setScreenshotProvider(async () => mockScreenshot);

    // Replace captureZoomedScreenshot so the region passed by takeAction can be observed
    const mockCaptureZoomedScreenshot = vi
      .fn()
      .mockResolvedValue(`data:image/png;base64,${mockScreenshot}`);
    (
      client as unknown as {
        captureZoomedScreenshot: typeof mockCaptureZoomedScreenshot;
      }
    ).captureZoomedScreenshot = mockCaptureZoomedScreenshot;

    const results = await takeActionOf(client)([zoomToolUse()], vi.fn());

    // Verify that captureZoomedScreenshot was called with the correct region
    expect(mockCaptureZoomedScreenshot).toHaveBeenCalledWith([
      100, 200, 400, 350,
    ]);

    // Verify the result contains an image
    expect(results).toHaveLength(1);
    expect(results[0].type).toBe("tool_result");
    expect(results[0].tool_use_id).toBe("tool-use-1");
    expect(results[0].content.find((c) => c.type === "image")).toBeDefined();
  });

  it("falls back to regular screenshot when zoomedScreenshotProvider is not set", async () => {
    const client = createClient();

    // Only set the regular screenshot provider
    client.setScreenshotProvider(async () => mockScreenshot);

    // Should not throw, should return a result with an image
    const results = await takeActionOf(client)([zoomToolUse()], vi.fn());

    expect(results).toHaveLength(1);
    expect(results[0].type).toBe("tool_result");

    // Should have image content from the regular screenshot
    expect(results[0].content.find((c) => c.type === "image")).toBeDefined();
  });
});

describe("setZoomedScreenshotProvider", () => {
  it("allows setting a custom zoomed screenshot provider", () => {
    const client = createClient();
    const providerStub = vi.fn().mockResolvedValue("base64-image");

    // The setter must be part of the client's public surface
    expect(typeof client.setZoomedScreenshotProvider).toBe("function");

    // Registering a provider must not throw
    expect(() =>
      client.setZoomedScreenshotProvider(providerStub),
    ).not.toThrow();
  });
});
});
Loading
Loading