diff --git a/docs/ios_setup/ios_setup.md b/docs/ios_setup/ios_setup.md index a2b22fee..f8b25c46 100644 --- a/docs/ios_setup/ios_setup.md +++ b/docs/ios_setup/ios_setup.md @@ -10,7 +10,6 @@ - iOS 设备(iPhone/iPad) - USB 数据线或同一 WiFi 网络 - ## WebDriverAgent 配置 WebDriverAgent 是 iOS 自动化的核心组件,需要在 iOS 设备上运行。 @@ -28,7 +27,7 @@ cd WebDriverAgent 1. 在 Xcode 中选中 `WebDriverAgent`,出现General、Signing&Capabilities等选项。 2. 进入 `Signing & Capabilities` 选项卡 -3. 勾选 `Automatically manage signing`。在Team中选择自己的开发者账号 +3. 勾选 `Automatically manage signing`。在Team中选择自己的开发者账号 4. 将 Bundle ID 改为唯一标识符,例如:`com.yourname.WebDriverAgentRunner` ![设置签名1](resources/ios0_WebDriverAgent0.png) @@ -43,7 +42,7 @@ Mac和iPhone有USB和WiFi两种连接方式,建议通过USB方式,成功率 #### 通过 WiFi 连接 需要满足以下条件: -1. 通过USB连接。在Finder中选中连接的IPhone,在“通用”中勾选"在 WiFi 中显示这台 iPhone" +1. 通过USB连接。在Finder中选中连接的IPhone,在“通用”中勾选"在 WiFi 中显示这台 iPhone" 2. Mac 与 iPhone 处于同一 WiFi 网络之下 #### 具体步骤 @@ -52,7 +51,7 @@ Mac和iPhone有USB和WiFi两种连接方式,建议通过USB方式,成功率 ![选择设备](resources/select-your-iphone-device.png) -3. 长按"▶️"运行按钮,选择 "Test" 后开始编译并部署到你的 iPhone 上 +1. 长按"▶️"运行按钮,选择 "Test" 后开始编译并部署到你的 iPhone 上 ![开始测试](resources/start-wda-testing.png) @@ -85,6 +84,7 @@ brew install libimobiledevice # 设备检查 idevice_id -ln ``` + 2.使用xcodebuild安装WebAgent。命令行也需要进行“设备信任配置”,参考GUI模式下的方法。 ``` @@ -95,6 +95,7 @@ xcodebuild -project WebDriverAgent.xcodeproj \ -destination 'platform=iOS,name=YOUR_PHONE_NAME' \ test ``` + 这里,YOUR_PHONE_NAME可以在xcode的GUI中看到。 WebDriverAgent 成功运行后,会在 Xcode 控制台输出类似以下信息: diff --git a/ios.py b/ios.py index 78dfc5ea..521d64f5 100755 --- a/ios.py +++ b/ios.py @@ -10,7 +10,7 @@ PHONE_AGENT_MODEL: Model name (default: autoglm-phone-9b) PHONE_AGENT_MAX_STEPS: Maximum steps per task (default: 100) PHONE_AGENT_WDA_URL: WebDriverAgent URL (default: http://localhost:8100) - PHONE_AGENT_DEVICE_ID: iOS device UDID for multi-device setups + PHONE_AGENT_DEVICE_ID: iOS device UUID for multi-device setups """ import argparse @@ -99,9 +99,7 @@ def check_system_requirements(wda_url: str = "http://localhost:8100") -> bool: print(" 4. Or connect via WiFi using device IP") all_passed = False else: - device_names = [ - d.device_name or d.device_id[:8] + "..." for d in devices - ] + device_names = [d.device_name or d.device_id[:8] + "..." for d in devices] print(f"✅ OK ({len(devices)} device(s): {', '.join(device_names)})") except Exception as e: print("❌ FAILED") @@ -261,7 +259,7 @@ def parse_args() -> argparse.Namespace: python ios.py --base-url http://localhost:8000/v1 # Run with specific device - python ios.py --device-id + python ios.py --device-id # Use WiFi connection python ios.py --wda-url http://192.168.1.100:8100 @@ -315,7 +313,7 @@ def parse_args() -> argparse.Namespace: "-d", type=str, default=os.getenv("PHONE_AGENT_DEVICE_ID"), - help="iOS device UDID", + help="iOS device UUID", ) parser.add_argument( @@ -326,7 +324,9 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( - "--list-devices", action="store_true", help="List connected iOS devices and exit" + "--list-devices", + action="store_true", + help="List connected iOS devices and exit", ) parser.add_argument( @@ -396,7 +396,7 @@ def handle_device_commands(args) -> bool: name_info = device.device_name or "Unnamed" print(f" ✓ {name_info}") - print(f" UDID: {device.device_id}") + print(f" UUID: {device.device_id}") print(f" Model: {model_info}") print(f" OS: {ios_info}") print(f" Connection: {conn_type}") @@ -474,9 +474,7 @@ def main(): # Create configurations model_config = ModelConfig( - base_url=args.base_url, - model_name=args.model, - api_key=args.api_key + base_url=args.base_url, model_name=args.model, api_key=args.api_key ) agent_config = IOSAgentConfig( diff --git a/phone_agent/actions/handler.py b/phone_agent/actions/handler.py index 0bef1c3a..a54850df 100644 --- a/phone_agent/actions/handler.py +++ b/phone_agent/actions/handler.py @@ -265,7 +265,7 @@ def _send_keyevent(self, keycode: str) -> None: # Handle HDC devices with HarmonyOS-specific keyEvent command if device_factory.device_type == DeviceType.HDC: hdc_prefix = ["hdc", "-t", self.device_id] if self.device_id else ["hdc"] - + # Map common keycodes to HarmonyOS keyEvent codes # KEYCODE_ENTER (66) -> 2054 (HarmonyOS Enter key code) if keycode == "KEYCODE_ENTER" or keycode == "66": @@ -283,7 +283,8 @@ def _send_keyevent(self, keycode: str) -> None: # For now, only handle ENTER, other keys may need mapping if "ENTER" in keycode: _run_hdc_command( - hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"], + hdc_prefix + + ["shell", "uitest", "uiInput", "keyEvent", "2054"], capture_output=True, text=True, ) @@ -297,7 +298,8 @@ def _send_keyevent(self, keycode: str) -> None: else: # Assume it's a numeric code _run_hdc_command( - hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)], + hdc_prefix + + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)], capture_output=True, text=True, ) @@ -342,22 +344,101 @@ def parse_action(response: str) -> dict[str, Any]: Raises: ValueError: If the response cannot be parsed. """ - print(f"Parsing action: {response}") + if not response or not response.strip(): + return {"_metadata": "finish", "message": "Model returned an empty action."} + try: response = response.strip() + + # 1. Try to extract do(...) or finish(...) using regex if it's wrapped in other text + do_match = re.search(r"do\(.*?\)", response, re.DOTALL) + finish_match = re.search(r"finish\(.*?\)", response, re.DOTALL) + + if do_match: + action_str = do_match.group(0) + # Special handling for Type action with text that might contain special characters + if 'action="Type"' in action_str or 'action="Type_Name"' in action_str: + if "text=" in action_str: + try: + # Try to extract text between quotes more robustly + text_part = action_str.split("text=", 1)[1] + # Find the first and last quote + first_quote = text_part.find('"') + last_quote = text_part.rfind('"') + if ( + first_quote != -1 + and last_quote != -1 + and first_quote < last_quote + ): + text = text_part[first_quote + 1 : last_quote] + # Extract action type + action_type = ( + "Type" if 'action="Type"' in action_str else "Type_Name" + ) + return { + "_metadata": "do", + "action": action_type, + "text": text, + } + except Exception: + pass # Fallback to AST if regex fails + + # Standard do(...) parsing using AST + try: + # Clean up the string for AST + clean_str = ( + action_str.replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") + ) + tree = ast.parse(clean_str, mode="eval") + if isinstance(tree.body, ast.Call): + call = tree.body + action = {"_metadata": "do"} + for keyword in call.keywords: + key = keyword.arg + try: + value = ast.literal_eval(keyword.value) + action[key] = value + except (ValueError, SyntaxError): + # Fallback for non-literal values (though model should only output literals) + if isinstance(keyword.value, ast.Constant): + action[key] = keyword.value.value + else: + # Last resort: raw string representation + action[key] = str(keyword.value) + return action + except (SyntaxError, ValueError) as e: + print(f"AST parsing failed for {action_str}: {e}") + # If it's a simple do(action="Home") but AST failed, try one more manual parse + if 'action="Home"' in action_str: + return {"_metadata": "do", "action": "Home"} + + if finish_match: + action_str = finish_match.group(0) + # Simple extraction for finish(message="...") + message = "" + if 'message="' in action_str: + parts = action_str.split('message="', 1)[1].rsplit('"', 1) + if len(parts) >= 1: + message = parts[0] + elif "message='" in action_str: + parts = action_str.split("message='", 1)[1].rsplit("'", 1) + if len(parts) >= 1: + message = parts[0] + + return {"_metadata": "finish", "message": message} + + # Legacy/Fallback behavior if response.startswith('do(action="Type"') or response.startswith( 'do(action="Type_Name"' ): - text = response.split("text=", 1)[1][1:-2] - action = {"_metadata": "do", "action": "Type", "text": text} - return action - elif response.startswith("do"): # Use AST parsing instead of eval for safety try: # Escape special characters (newlines, tabs, etc.) for valid Python syntax - response = response.replace('\n', '\\n') - response = response.replace('\r', '\\r') - response = response.replace('\t', '\\t') + response = response.replace("\n", "\\n") + response = response.replace("\r", "\\r") + response = response.replace("\t", "\\t") tree = ast.parse(response, mode="eval") if not isinstance(tree.body, ast.Call): @@ -381,10 +462,16 @@ def parse_action(response: str) -> dict[str, Any]: "message": response.replace("finish(message=", "")[1:-2], } else: - raise ValueError(f"Failed to parse action: {response}") + # If all parsing attempts fail, treat the entire response as a message for 'finish' + # This is more robust than crashing with a ValueError + action = {"_metadata": "finish", "message": response} return action except Exception as e: - raise ValueError(f"Failed to parse action: {e}") + # Final fallback: return the original response if possible + return { + "_metadata": "finish", + "message": f"Parsing failed: {str(e)}. Raw response: {response}", + } def do(**kwargs) -> dict[str, Any]: diff --git a/phone_agent/actions/handler_ios.py b/phone_agent/actions/handler_ios.py index c37f50d9..11c977b3 100644 --- a/phone_agent/actions/handler_ios.py +++ b/phone_agent/actions/handler_ios.py @@ -129,9 +129,7 @@ def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult: if not app_name: return ActionResult(False, False, "No app name specified") - success = launch_app( - app_name, wda_url=self.wda_url, session_id=self.session_id - ) + success = launch_app(app_name, wda_url=self.wda_url, session_id=self.session_id) if success: return ActionResult(True, False) return ActionResult(False, False, f"App not found: {app_name}") diff --git a/phone_agent/adb/connection.py b/phone_agent/adb/connection.py index b723ca91..a132821f 100644 --- a/phone_agent/adb/connection.py +++ b/phone_agent/adb/connection.py @@ -109,7 +109,9 @@ def disconnect(self, address: str | None = None) -> tuple[bool, str]: if address: cmd.append(address) - result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5) + result = subprocess.run( + cmd, capture_output=True, text=True, encoding="utf-8", timeout=5 + ) output = result.stdout + result.stderr return True, output.strip() or "Disconnected" @@ -241,7 +243,9 @@ def enable_tcpip( cmd.extend(["-s", device_id]) cmd.extend(["tcpip", str(port)]) - result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10) + result = subprocess.run( + cmd, capture_output=True, text=True, encoding="utf-8", timeout=10 + ) output = result.stdout + result.stderr @@ -270,7 +274,9 @@ def get_device_ip(self, device_id: str | None = None) -> str | None: cmd.extend(["-s", device_id]) cmd.extend(["shell", "ip", "route"]) - result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5) + result = subprocess.run( + cmd, capture_output=True, text=True, encoding="utf-8", timeout=5 + ) # Parse IP from route output for line in result.stdout.split("\n"): diff --git a/phone_agent/adb/device.py b/phone_agent/adb/device.py index 995336a1..81a21616 100644 --- a/phone_agent/adb/device.py +++ b/phone_agent/adb/device.py @@ -22,7 +22,10 @@ def get_current_app(device_id: str | None = None) -> str: adb_prefix = _get_adb_prefix(device_id) result = subprocess.run( - adb_prefix + ["shell", "dumpsys", "window"], capture_output=True, text=True, encoding="utf-8" + adb_prefix + ["shell", "dumpsys", "window"], + capture_output=True, + text=True, + encoding="utf-8", ) output = result.stdout if not output: diff --git a/phone_agent/agent.py b/phone_agent/agent.py index 36427917..d65405a8 100644 --- a/phone_agent/agent.py +++ b/phone_agent/agent.py @@ -1,8 +1,10 @@ """Main PhoneAgent class for orchestrating phone automation.""" import json +import os import traceback from dataclasses import dataclass +from datetime import datetime from typing import Any, Callable from phone_agent.actions import ActionHandler @@ -171,9 +173,6 @@ def _execute_step( # Get model response try: msgs = get_messages(self.agent_config.lang) - print("\n" + "=" * 50) - print(f"💭 {msgs['thinking']}:") - print("-" * 50) response = self.model_client.request(self._context) except Exception as e: if self.agent_config.verbose: @@ -195,8 +194,7 @@ def _execute_step( action = finish(message=response.action) if self.agent_config.verbose: - # Print thinking process - print("-" * 50) + # Print parsed action print(f"🎯 {msgs['action']}:") print(json.dumps(action, ensure_ascii=False, indent=2)) print("=" * 50 + "\n") diff --git a/phone_agent/agent_ios.py b/phone_agent/agent_ios.py index a3b20d9f..7cbe9cec 100644 --- a/phone_agent/agent_ios.py +++ b/phone_agent/agent_ios.py @@ -20,7 +20,7 @@ class IOSAgentConfig: max_steps: int = 100 wda_url: str = "http://localhost:8100" session_id: str | None = None - device_id: str | None = None # iOS device UDID + device_id: str | None = None # iOS device UUID lang: str = "cn" system_prompt: str | None = None verbose: bool = True @@ -214,13 +214,8 @@ def _execute_step( action = finish(message=response.action) if self.agent_config.verbose: - # Print thinking process + # Print parsed action msgs = get_messages(self.agent_config.lang) - print("\n" + "=" * 50) - print(f"💭 {msgs['thinking']}:") - print("-" * 50) - print(response.thinking) - print("-" * 50) print(f"🎯 {msgs['action']}:") print(json.dumps(action, ensure_ascii=False, indent=2)) print("=" * 50 + "\n") diff --git a/phone_agent/config/apps.py b/phone_agent/config/apps.py index 874592da..0db93e15 100644 --- a/phone_agent/config/apps.py +++ b/phone_agent/config/apps.py @@ -29,7 +29,7 @@ "12306": "com.MobileTicket", "去哪儿": "com.Qunar", "去哪儿旅行": "com.Qunar", - "滴滴出行": "com.sdu.didi.psnger", + "滴滴出行": "com.sdu.did.psnger", # Video & Entertainment "bilibili": "tv.danmaku.bili", "抖音": "com.ss.android.ugc.aweme", @@ -224,4 +224,4 @@ def list_supported_apps() -> list[str]: Returns: List of app names. """ - return list(APP_PACKAGES.keys()) \ No newline at end of file + return list(APP_PACKAGES.keys()) diff --git a/phone_agent/config/apps_harmonyos.py b/phone_agent/config/apps_harmonyos.py index 5ffb52c5..76018a7f 100644 --- a/phone_agent/config/apps_harmonyos.py +++ b/phone_agent/config/apps_harmonyos.py @@ -21,7 +21,6 @@ "com.tencent.videohm": "AppAbility", "com.ximalaya.ting.xmharmony": "MainBundleAbility", "com.zhihu.hmos": "PhoneAbility", - # Huawei system apps "com.huawei.hmos.browser": "MainAbility", "com.huawei.hmos.calculator": "com.huawei.hmos.calculator.CalculatorAbility", @@ -40,7 +39,6 @@ "com.huawei.hmos.soundrecorder": "MainAbility", "com.huawei.hmos.vassistant": "AiCaptionServiceExtAbility", "com.huawei.hmos.wallet": "MainAbility", - # Huawei services "com.huawei.hmsapp.appgallery": "MainAbility", "com.huawei.hmsapp.books": "MainAbility", @@ -49,7 +47,6 @@ "com.huawei.hmsapp.music": "MainAbility", "com.huawei.hmsapp.thememanager": "MainAbility", "com.huawei.hmsapp.totemweather": "com.huawei.hmsapp.totemweather.MainAbility", - # OHOS system apps "com.ohos.callui": "com.ohos.callui.ServiceAbility", "com.ohos.contacts": "com.ohos.contacts.MainAbility", @@ -85,7 +82,7 @@ "12306": "com.chinarailway.ticketingHM", # "去哪儿": "com.Qunar", # 未在 hdc 列表中找到 # "去哪儿旅行": "com.Qunar", # 未在 hdc 列表中找到 - "滴滴出行": "com.sdu.didi.hmos.psnger", + "滴滴出行": "com.sdu.did.hmos.psnger", # Video & Entertainment "bilibili": "yylx.danmaku.bili", "抖音": "com.ss.hm.ugc.aweme", @@ -124,7 +121,6 @@ # "星穹铁道": "com.miHoYo.hkrpg", # 未在 hdc 列表中找到 # "崩坏:星穹铁道": "com.miHoYo.hkrpg", # 未在 hdc 列表中找到 # "恋与深空": "com.papegames.lysk.cn", # 未在 hdc 列表中找到 - # HarmonyOS 第三方应用 "百度": "com.baidu.baiduapp", "阿里巴巴": "com.alibaba.wireless_hmos", @@ -149,7 +145,6 @@ "国家税务总局": "cn.gov.chinatax.gt4.hm", "建设银行": "com.ccb.mobilebank.hm", "快手极速版": "com.kuaishou.hmnebula", - # HarmonyOS 系统应用 - 工具类 "浏览器": "com.huawei.hmos.browser", "计算器": "com.huawei.hmos.calculator", @@ -169,12 +164,10 @@ "截屏": "com.huawei.hmos.screenshot", "笔记": "com.huawei.hmos.notepad", "备忘录": "com.huawei.hmos.notepad", - # HarmonyOS 系统应用 - 媒体类 "相册": "com.huawei.hmos.photos", "图库": "com.huawei.hmos.photos", # "视频": "com.huawei.hmos.mediaplayer", # 未在 hdc 列表中找到,但有 com.huawei.hmsapp.himovie - # HarmonyOS 系统应用 - 通讯类 "联系人": "com.ohos.contacts", "通讯录": "com.ohos.contacts", @@ -182,7 +175,6 @@ "信息": "com.ohos.mms", "电话": "com.ohos.callui", "拨号": "com.ohos.callui", - # HarmonyOS 系统应用 - 设置类 "设置": "com.huawei.hmos.settings", "系统设置": "com.huawei.hmos.settings", @@ -191,7 +183,6 @@ "Android System Settings": "com.huawei.hmos.settings", "Android-System-Settings": "com.huawei.hmos.settings", "Settings": "com.huawei.hmos.settings", - # HarmonyOS 系统应用 - 生活服务 "健康": "com.huawei.hmos.health", "运动健康": "com.huawei.hmos.health", @@ -202,7 +193,6 @@ "智慧生活": "com.huawei.hmos.ailife", "智能助手": "com.huawei.hmos.vassistant", "小艺": "com.huawei.hmos.vassistant", - # HarmonyOS 服务 "应用市场": "com.huawei.hmsapp.appgallery", "华为应用市场": "com.huawei.hmsapp.appgallery", diff --git a/phone_agent/hdc/__init__.py b/phone_agent/hdc/__init__.py index 9b06993f..59c68d17 100644 --- a/phone_agent/hdc/__init__.py +++ b/phone_agent/hdc/__init__.py @@ -1,9 +1,9 @@ """HDC utilities for HarmonyOS device interaction.""" from phone_agent.hdc.connection import ( - HDCConnection, ConnectionType, DeviceInfo, + HDCConnection, list_devices, quick_connect, set_hdc_verbose, diff --git a/phone_agent/hdc/connection.py b/phone_agent/hdc/connection.py index 15809f84..d1a29899 100644 --- a/phone_agent/hdc/connection.py +++ b/phone_agent/hdc/connection.py @@ -9,7 +9,6 @@ from phone_agent.config.timing import TIMING_CONFIG - # Global flag to control HDC command output _HDC_VERBOSE = os.getenv("HDC_VERBOSE", "false").lower() in ("true", "1", "yes") @@ -32,7 +31,7 @@ def _run_hdc_command(cmd: list, **kwargs) -> subprocess.CompletedProcess: if _HDC_VERBOSE and result.returncode != 0: print(f"[HDC] Command failed with return code {result.returncode}") - if hasattr(result, 'stderr') and result.stderr: + if hasattr(result, "stderr") and result.stderr: print(f"[HDC] Error: {result.stderr}") return result @@ -150,11 +149,13 @@ def disconnect(self, address: str | None = None) -> tuple[bool, str]: [self.hdc_path, "tdisconn", device.device_id], capture_output=True, text=True, - timeout=5 + timeout=5, ) return True, "Disconnected all remote devices" - result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5) + result = _run_hdc_command( + cmd, capture_output=True, text=True, encoding="utf-8", timeout=5 + ) output = result.stdout + result.stderr return True, output.strip() or "Disconnected" @@ -278,7 +279,9 @@ def enable_tcpip( cmd.extend(["-t", device_id]) cmd.extend(["tmode", "port", str(port)]) - result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10) + result = _run_hdc_command( + cmd, capture_output=True, text=True, encoding="utf-8", timeout=10 + ) output = result.stdout + result.stderr @@ -307,7 +310,9 @@ def get_device_ip(self, device_id: str | None = None) -> str | None: cmd.extend(["-t", device_id]) cmd.extend(["shell", "ifconfig"]) - result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5) + result = _run_hdc_command( + cmd, capture_output=True, text=True, encoding="utf-8", timeout=5 + ) # Parse IP from ifconfig output for line in result.stdout.split("\n"): @@ -339,9 +344,7 @@ def restart_server(self) -> tuple[bool, str]: """ try: # Kill server - _run_hdc_command( - [self.hdc_path, "kill"], capture_output=True, timeout=5 - ) + _run_hdc_command([self.hdc_path, "kill"], capture_output=True, timeout=5) time.sleep(TIMING_CONFIG.connection.server_restart_delay) diff --git a/phone_agent/hdc/device.py b/phone_agent/hdc/device.py index 9d693b42..dcef40a0 100644 --- a/phone_agent/hdc/device.py +++ b/phone_agent/hdc/device.py @@ -1,6 +1,7 @@ """Device control utilities for HarmonyOS automation.""" import os +import re import subprocess import time from typing import List, Optional, Tuple @@ -8,7 +9,7 @@ from phone_agent.config.apps_harmonyos import APP_ABILITIES, APP_PACKAGES from phone_agent.config.timing import TIMING_CONFIG from phone_agent.hdc.connection import _run_hdc_command -import re + def get_current_app(device_id: str | None = None) -> str: """ @@ -27,7 +28,7 @@ def get_current_app(device_id: str | None = None) -> str: hdc_prefix + ["shell", "aa", "dump", "-l"], capture_output=True, text=True, - encoding="utf-8" + encoding="utf-8", ) output = result.stdout # print(output) @@ -51,7 +52,7 @@ def get_current_app(device_id: str | None = None) -> str: for line in lines: # Track the current mission's bundle name if "app name [" in line: - match = re.search(r'\[([^\]]+)\]', line) + match = re.search(r"\[([^\]]+)\]", line) if match: current_bundle = match.group(1) @@ -71,9 +72,9 @@ def get_current_app(device_id: str | None = None) -> str: if package == foreground_bundle: return app_name # If bundle is found but not in our known apps, return the bundle name - print(f'Bundle is found but not in our known apps: {foreground_bundle}') + print(f"Bundle is found but not in our known apps: {foreground_bundle}") return foreground_bundle - print(f'No bundle is found') + print(f"No bundle is found") return "System Home" @@ -97,7 +98,7 @@ def tap( # HarmonyOS uses uitest uiInput click _run_hdc_command( hdc_prefix + ["shell", "uitest", "uiInput", "click", str(x), str(y)], - capture_output=True + capture_output=True, ) time.sleep(delay) @@ -122,7 +123,7 @@ def double_tap( # HarmonyOS uses uitest uiInput doubleClick _run_hdc_command( hdc_prefix + ["shell", "uitest", "uiInput", "doubleClick", str(x), str(y)], - capture_output=True + capture_output=True, ) time.sleep(delay) @@ -226,7 +227,7 @@ def back(device_id: str | None = None, delay: float | None = None) -> None: # HarmonyOS uses uitest uiInput keyEvent Back _run_hdc_command( hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "Back"], - capture_output=True + capture_output=True, ) time.sleep(delay) @@ -247,7 +248,7 @@ def home(device_id: str | None = None, delay: float | None = None) -> None: # HarmonyOS uses uitest uiInput keyEvent Home _run_hdc_command( hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "Home"], - capture_output=True + capture_output=True, ) time.sleep(delay) @@ -306,5 +307,6 @@ def _get_hdc_prefix(device_id: str | None) -> list: return ["hdc", "-t", device_id] return ["hdc"] + if __name__ == "__main__": print(get_current_app()) diff --git a/phone_agent/hdc/input.py b/phone_agent/hdc/input.py index 920cf7dd..45a7d2ce 100644 --- a/phone_agent/hdc/input.py +++ b/phone_agent/hdc/input.py @@ -25,8 +25,8 @@ def type_text(text: str, device_id: str | None = None) -> None: hdc_prefix = _get_hdc_prefix(device_id) # Handle multi-line text by splitting on newlines - if '\n' in text: - lines = text.split('\n') + if "\n" in text: + lines = text.split("\n") for i, line in enumerate(lines): if line: # Only process non-empty lines # Escape special characters for shell diff --git a/phone_agent/hdc/screenshot.py b/phone_agent/hdc/screenshot.py index 332d198c..d0d2a437 100644 --- a/phone_agent/hdc/screenshot.py +++ b/phone_agent/hdc/screenshot.py @@ -10,6 +10,7 @@ from typing import Tuple from PIL import Image + from phone_agent.hdc.connection import _run_hdc_command @@ -56,7 +57,11 @@ def get_screenshot(device_id: str | None = None, timeout: int = 10) -> Screensho # Check for screenshot failure (sensitive screen) output = result.stdout + result.stderr - if "fail" in output.lower() or "error" in output.lower() or "not found" in output.lower(): + if ( + "fail" in output.lower() + or "error" in output.lower() + or "not found" in output.lower() + ): # Try method 2: snapshot_display (older versions or different devices) result = _run_hdc_command( hdc_prefix + ["shell", "snapshot_display", "-f", remote_path], diff --git a/phone_agent/model/client.py b/phone_agent/model/client.py index 72377a61..f4e4c485 100644 --- a/phone_agent/model/client.py +++ b/phone_agent/model/client.py @@ -81,9 +81,20 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse: raw_content = "" buffer = "" # Buffer to hold content that might be part of a marker - action_markers = ["finish(message=", "do(action="] + + # Action markers that signal the end of the thinking process + action_markers = ["", "finish(message=", "do(action="] + # Tags to hide from the output stream + hide_tags = ["", ""] + in_action_phase = False # Track if we've entered the action phase first_token_received = False + lang = self.config.lang + + # Print thinking header + print("\n" + "=" * 50) + print(f"💭 {get_message('thinking', lang)}:") + print("-" * 50) for chunk in stream: if len(chunk.choices) == 0: @@ -103,7 +114,12 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse: buffer += content - # Check if any marker is fully present in buffer + # Hide tags from stream + for tag in hide_tags: + if tag in buffer: + buffer = buffer.replace(tag, "") + + # Check if any action marker is fully present in buffer marker_found = False for marker in action_markers: if marker in buffer: @@ -123,10 +139,10 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse: if marker_found: continue # Continue to collect remaining content - # Check if buffer ends with a prefix of any marker + # Check if buffer ends with a prefix of any marker or tag # If so, don't print yet (wait for more content) is_potential_marker = False - for marker in action_markers: + for marker in action_markers + hide_tags: for i in range(1, len(marker)): if buffer.endswith(marker[:i]): is_potential_marker = True @@ -147,7 +163,6 @@ def request(self, messages: list[dict[str, Any]]) -> ModelResponse: # Print performance metrics lang = self.config.lang - print() print("=" * 50) print(f"⏱️ {get_message('performance_metrics', lang)}:") print("-" * 50) @@ -178,11 +193,11 @@ def _parse_response(self, content: str) -> tuple[str, str]: Parse the model response into thinking and action parts. Parsing rules: - 1. If content contains 'finish(message=', everything before is thinking, + 1. If content contains XML tags and , use them. + 2. If content contains 'finish(message=', everything before is thinking, everything from 'finish(message=' onwards is action. - 2. If rule 1 doesn't apply but content contains 'do(action=', - everything before is thinking, everything from 'do(action=' onwards is action. - 3. Fallback: If content contains '', use legacy parsing with XML tags. + 3. If content contains 'do(action=', everything before is thinking, + everything from 'do(action=' onwards is action. 4. Otherwise, return empty thinking and full content as action. Args: @@ -191,25 +206,45 @@ def _parse_response(self, content: str) -> tuple[str, str]: Returns: Tuple of (thinking, action). """ - # Rule 1: Check for finish(message= + # Rule 1: XML tag parsing (highest priority if tags exist) + if "" in content: + thinking = "" + if "" in content and "" in content: + thinking_parts = content.split("", 1)[1].split("", 1) + thinking = thinking_parts[0].strip() + elif "" in content: + thinking = ( + content.split("", 1)[1].split("", 1)[0].strip() + ) + else: + thinking = content.split("", 1)[0].strip() + + action_part = content.split("", 1)[1] + if "" in action_part: + action = action_part.split("", 1)[0].strip() + else: + action = action_part.strip() + + return thinking, action + + # Rule 2: Check for finish(message= if "finish(message=" in content: parts = content.split("finish(message=", 1) thinking = parts[0].strip() action = "finish(message=" + parts[1] + # Clean up trailing tags if any + if "" in action: + action = action.split("", 1)[0].strip() return thinking, action - # Rule 2: Check for do(action= + # Rule 3: Check for do(action= if "do(action=" in content: parts = content.split("do(action=", 1) thinking = parts[0].strip() action = "do(action=" + parts[1] - return thinking, action - - # Rule 3: Fallback to legacy XML tag parsing - if "" in content: - parts = content.split("", 1) - thinking = parts[0].replace("", "").replace("", "").strip() - action = parts[1].replace("", "").strip() + # Clean up trailing tags if any + if "" in action: + action = action.split("", 1)[0].strip() return thinking, action # Rule 4: No markers found, return content as action diff --git a/phone_agent/xctest/connection.py b/phone_agent/xctest/connection.py index deb29369..8a5373f2 100644 --- a/phone_agent/xctest/connection.py +++ b/phone_agent/xctest/connection.py @@ -17,7 +17,7 @@ class ConnectionType(Enum): class DeviceInfo: """Information about a connected iOS device.""" - device_id: str # UDID + device_id: str # UUID status: str connection_type: ConnectionType model: str | None = None @@ -76,23 +76,23 @@ def list_devices(self) -> list[DeviceInfo]: devices = [] for line in result.stdout.strip().split("\n"): - udid = line.strip() - if not udid: + uuid = line.strip() + if not uuid: continue # Determine connection type (network devices have specific format) conn_type = ( ConnectionType.NETWORK - if "-" in udid and len(udid) > 40 + if "-" in uuid and len(uuid) > 40 else ConnectionType.USB ) # Get detailed device info - device_info = self._get_device_details(udid) + device_info = self._get_device_details(uuid) devices.append( DeviceInfo( - device_id=udid, + device_id=uuid, status="connected", connection_type=conn_type, model=device_info.get("model"), @@ -112,19 +112,19 @@ def list_devices(self) -> list[DeviceInfo]: print(f"Error listing devices: {e}") return [] - def _get_device_details(self, udid: str) -> dict[str, str]: + def _get_device_details(self, uuid: str) -> dict[str, str]: """ Get detailed information about a specific device. Args: - udid: Device UDID. + uuid: Device UUID. Returns: Dictionary with device details. """ try: result = subprocess.run( - ["ideviceinfo", "-u", udid], + ["ideviceinfo", "-u", uuid], capture_output=True, text=True, timeout=5, @@ -154,7 +154,7 @@ def get_device_info(self, device_id: str | None = None) -> DeviceInfo | None: Get detailed information about a device. Args: - device_id: Device UDID. If None, uses first available device. + device_id: Device UUID. If None, uses first available device. Returns: DeviceInfo or None if not found. @@ -178,7 +178,7 @@ def is_connected(self, device_id: str | None = None) -> bool: Check if a device is connected. Args: - device_id: Device UDID to check. If None, checks if any device is connected. + device_id: Device UUID to check. If None, checks if any device is connected. Returns: True if connected, False otherwise. @@ -211,9 +211,7 @@ def is_wda_ready(self, timeout: int = 2) -> bool: ) return response.status_code == 200 except ImportError: - print( - "Error: requests library not found. Install it: pip install requests" - ) + print("Error: requests library not found. Install it: pip install requests") return False except Exception: return False @@ -276,7 +274,7 @@ def pair_device(self, device_id: str | None = None) -> tuple[bool, str]: Pair with an iOS device (required for some operations). Args: - device_id: Device UDID. If None, uses first available device. + device_id: Device UUID. If None, uses first available device. Returns: Tuple of (success, message). @@ -309,7 +307,7 @@ def get_device_name(self, device_id: str | None = None) -> str | None: Get the device name. Args: - device_id: Device UDID. If None, uses first available device. + device_id: Device UUID. If None, uses first available device. Returns: Device name string or None if not found. diff --git a/phone_agent/xctest/device.py b/phone_agent/xctest/device.py index 49fc379c..a4be177a 100644 --- a/phone_agent/xctest/device.py +++ b/phone_agent/xctest/device.py @@ -6,7 +6,8 @@ from phone_agent.config.apps_ios import APP_PACKAGES_IOS as APP_PACKAGES -SCALE_FACTOR = 3 # 3 for most modern iPhone +SCALE_FACTOR = 3 # 3 for most modern iPhone + def _get_wda_session_url(wda_url: str, session_id: str | None, endpoint: str) -> str: """ @@ -102,7 +103,12 @@ def tap( "id": "finger1", "parameters": {"pointerType": "touch"}, "actions": [ - {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR}, + { + "type": "pointerMove", + "duration": 0, + "x": x / SCALE_FACTOR, + "y": y / SCALE_FACTOR, + }, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": 0.1}, {"type": "pointerUp", "button": 0}, @@ -151,7 +157,12 @@ def double_tap( "id": "finger1", "parameters": {"pointerType": "touch"}, "actions": [ - {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR}, + { + "type": "pointerMove", + "duration": 0, + "x": x / SCALE_FACTOR, + "y": y / SCALE_FACTOR, + }, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": 100}, {"type": "pointerUp", "button": 0}, @@ -209,7 +220,12 @@ def long_press( "id": "finger1", "parameters": {"pointerType": "touch"}, "actions": [ - {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR}, + { + "type": "pointerMove", + "duration": 0, + "x": x / SCALE_FACTOR, + "y": y / SCALE_FACTOR, + }, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": duration_ms}, {"type": "pointerUp", "button": 0}, diff --git a/phone_agent/xctest/input.py b/phone_agent/xctest/input.py index 9c8d8656..52d5733f 100644 --- a/phone_agent/xctest/input.py +++ b/phone_agent/xctest/input.py @@ -49,11 +49,16 @@ def type_text( # Send text to WDA response = requests.post( - url, json={"value": list(text), "frequency": frequency}, timeout=30, verify=False + url, + json={"value": list(text), "frequency": frequency}, + timeout=30, + verify=False, ) if response.status_code not in (200, 201): - print(f"Warning: Text input may have failed. Status: {response.status_code}") + print( + f"Warning: Text input may have failed. Status: {response.status_code}" + ) except ImportError: print("Error: requests library required. Install: pip install requests") @@ -86,11 +91,15 @@ def clear_text( if response.status_code == 200: data = response.json() - element_id = data.get("value", {}).get("ELEMENT") or data.get("value", {}).get("element-6066-11e4-a52e-4f735466cecf") + element_id = data.get("value", {}).get("ELEMENT") or data.get( + "value", {} + ).get("element-6066-11e4-a52e-4f735466cecf") if element_id: # Clear the element - clear_url = _get_wda_session_url(wda_url, session_id, f"element/{element_id}/clear") + clear_url = _get_wda_session_url( + wda_url, session_id, f"element/{element_id}/clear" + ) requests.post(clear_url, timeout=10, verify=False) return @@ -259,7 +268,10 @@ def set_pasteboard( url = f"{wda_url.rstrip('/')}/wda/setPasteboard" requests.post( - url, json={"content": text, "contentType": "plaintext"}, timeout=10, verify=False + url, + json={"content": text, "contentType": "plaintext"}, + timeout=10, + verify=False, ) except ImportError: diff --git a/phone_agent/xctest/screenshot.py b/phone_agent/xctest/screenshot.py index bbf4bdd6..c52351d3 100644 --- a/phone_agent/xctest/screenshot.py +++ b/phone_agent/xctest/screenshot.py @@ -33,7 +33,7 @@ def get_screenshot( Args: wda_url: WebDriverAgent URL. session_id: Optional WDA session ID. - device_id: Optional device UDID (for idevicescreenshot fallback). + device_id: Optional device UUID (for idevicescreenshot fallback). timeout: Timeout in seconds for screenshot operations. Returns: @@ -103,14 +103,12 @@ def _get_screenshot_wda( return None -def _get_screenshot_idevice( - device_id: str | None, timeout: int -) -> Screenshot | None: +def _get_screenshot_idevice(device_id: str | None, timeout: int) -> Screenshot | None: """ Capture screenshot using idevicescreenshot (libimobiledevice). Args: - device_id: Optional device UDID. + device_id: Optional device UUID. timeout: Timeout in seconds. Returns: @@ -126,9 +124,7 @@ def _get_screenshot_idevice( cmd.extend(["-u", device_id]) cmd.append(temp_path) - result = subprocess.run( - cmd, capture_output=True, text=True, timeout=timeout - ) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) if result.returncode == 0 and os.path.exists(temp_path): # Read and encode image @@ -217,7 +213,7 @@ def get_screenshot_png( Args: wda_url: WebDriverAgent URL. session_id: Optional WDA session ID. - device_id: Optional device UDID. + device_id: Optional device UUID. Returns: PNG bytes or None if failed.