Skip to content

Commit a051c81

Browse files
authored
🤖 ci: improve integration test reliability (#1085)
## Summary

Fix flaky integration tests for image handling and MCP screenshot functionality.

## Changes

### Image Test Fix (`sendMessage.images.test.ts`)

- **Use 8-bit RGB PNGs instead of 1-bit indexed**: The original PNGs used 1-bit colormap encoding, which may not be properly processed by vision APIs. The fixtures now use an explicit `-define png:color-type=2` for proper 8-bit-per-channel RGB encoding.
- **Increase image size**: Changed from 1x1 to 4x4 pixels for more reliable vision model processing.
- **Better prompts**: Updated to explicitly describe the solid-color image and request just the color name.
- **Add debug logging**: Added logging when `sendMessage` fails to help diagnose future CI failures.
- **Both RED_PIXEL and BLUE_PIXEL fixtures updated** to use proper RGB encoding.

### MCP Screenshot Test Fix (`mcpConfig.test.ts`)

- **More directive prompts**: Changed prompts to explicitly specify the tool names that MUST be used (`chrome_navigate_page`, `chrome_take_screenshot`).
- **Add diagnostic logging**: When the screenshot tool call is missing, log which tools were actually called and the model response.

## Root Cause Analysis

The image test was failing because:

1. First retry: the API call returns `success=false` (transient API issue).
2. Subsequent retries: the API call succeeds but returns no text deltas (`deltas.length === 0`).

Investigation revealed that the PNG images were using a 1-bit indexed colormap format instead of proper RGB, which may cause issues with vision API processing.

_Generated with `mux`_
1 parent d45c0d5 commit a051c81

File tree

2 files changed

+36
-11
lines changed

2 files changed

+36
-11
lines changed

tests/ipc/mcpConfig.test.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,13 +199,14 @@ describeIntegration("MCP server integration with model", () => {
199199
const imageFormatCases = [
200200
{
201201
name: "PNG",
202-
prompt: "Navigate to https://example.com and take a screenshot. Describe what you see.",
202+
prompt:
203+
"You MUST use chrome_navigate_page to go to https://example.com, then MUST use chrome_take_screenshot to capture the page. After taking the screenshot, describe what you see in the image.",
203204
mediaTypePattern: /^image\//,
204205
},
205206
{
206207
name: "JPEG",
207208
prompt:
208-
'Navigate to https://example.com and take a screenshot in JPEG format (use format: "jpeg"). Describe what you see.',
209+
'You MUST use chrome_navigate_page to go to https://example.com, then MUST use chrome_take_screenshot with format "jpeg" to capture the page. After taking the screenshot, describe what you see in the image.',
209210
mediaTypePattern: /^image\/(jpeg|jpg|webp)$/,
210211
},
211212
] as const;
@@ -245,6 +246,15 @@ describeIntegration("MCP server integration with model", () => {
245246
(e): e is Extract<typeof e, { type: "tool-call-end" }> => e.type === "tool-call-end"
246247
);
247248
const screenshotResult = toolCallEnds.find((e) => e.toolName === "chrome_take_screenshot");
249+
250+
// Debug: log tool calls if screenshot not found
251+
if (!screenshotResult) {
252+
const toolNames = toolCallEnds.map((e) => e.toolName);
253+
const deltas = collector.getDeltas();
254+
const responseText = extractTextFromEvents(deltas);
255+
console.log(`[MCP ${name} Test] Tool calls made:`, toolNames);
256+
console.log(`[MCP ${name} Test] Model response:`, responseText.slice(0, 500));
257+
}
248258
expect(screenshotResult).toBeDefined();
249259

250260
// Validate result structure and media content

tests/ipc/sendMessage.images.test.ts

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,17 @@ if (shouldRunIntegrationTests()) {
2525
validateApiKeys(["OPENAI_API_KEY", "ANTHROPIC_API_KEY"]);
2626
}
2727

28-
// 1x1 red PNG pixel as base64 data URI
28+
// 4x4 pure red PNG (#FF0000) as base64 data URI
29+
// Uses 8-bit RGB color (not indexed) for reliable vision model processing
2930
const RED_PIXEL = {
30-
url: "",
31+
url: "",
3132
mediaType: "image/png" as const,
3233
};
3334

34-
// 1x1 blue PNG pixel as base64 data URI
35+
// 4x4 pure blue PNG (#0000FF) as base64 data URI
36+
// Uses 8-bit RGB color (not indexed) for reliable vision model processing
3537
const BLUE_PIXEL = {
36-
url: "",
38+
url: "",
3739
mediaType: "image/png" as const,
3840
};
3941

@@ -58,11 +60,20 @@ describeIntegration("sendMessage image handling tests", () => {
5860

5961
await withSharedWorkspace(provider, async ({ env, workspaceId, collector }) => {
6062
// Send message with image attachment
61-
const result = await sendMessage(env, workspaceId, "What color is this?", {
62-
model: modelString(provider, model),
63-
imageParts: [RED_PIXEL],
64-
});
63+
const result = await sendMessage(
64+
env,
65+
workspaceId,
66+
"This is a small solid-color image. What color is it? Answer with just the color name.",
67+
{
68+
model: modelString(provider, model),
69+
imageParts: [RED_PIXEL],
70+
}
71+
);
6572

73+
// Debug: log if sendMessage failed
74+
if (!result.success) {
75+
console.log(`[Image Test] sendMessage failed:`, JSON.stringify(result, null, 2));
76+
}
6677
expect(result.success).toBe(true);
6778

6879
// Wait for stream to complete
@@ -81,7 +92,7 @@ describeIntegration("sendMessage image handling tests", () => {
8192
// Should mention red color in some form
8293
expect(fullResponse.length).toBeGreaterThan(0);
8394
// Red pixel should be detected (flexible matching as different models may phrase differently)
84-
expect(fullResponse).toMatch(/red|color|orange/i);
95+
expect(fullResponse).toMatch(/red/i);
8596
});
8697
},
8798
40000 // Vision models can be slower
@@ -100,6 +111,10 @@ describeIntegration("sendMessage image handling tests", () => {
100111
imageParts: [RED_PIXEL, BLUE_PIXEL],
101112
});
102113

114+
// Debug: log if sendMessage failed
115+
if (!result.success) {
116+
console.log(`[Image Test Multi] sendMessage failed:`, JSON.stringify(result, null, 2));
117+
}
103118
expect(result.success).toBe(true);
104119

105120
// Wait for stream to complete

0 commit comments

Comments
 (0)