From 57797624918071c8db11832574656745147eeccd Mon Sep 17 00:00:00 2001 From: raiden-staging Date: Tue, 3 Feb 2026 15:17:19 +0100 Subject: [PATCH] template : groq llm orchestration + moondream vision --- README.md | 4 + .../moondream-groq-computer-use/.env.example | 2 + .../moondream-groq-computer-use/README.md | 64 ++ .../moondream-groq-computer-use/_gitignore | 6 + .../moondream-groq-computer-use/llm_loop.py | 899 ++++++++++++++++++ .../moondream-groq-computer-use/loop.py | 329 +++++++ .../moondream-groq-computer-use/main.py | 152 +++ .../moondream-groq-computer-use/moondream.py | 136 +++ .../pyproject.toml | 16 + .../moondream-groq-computer-use/session.py | 130 +++ .../tools/__init__.py | 23 + .../tools/computer.py | 276 ++++++ .../tools/types.py | 85 ++ .../moondream-groq-computer-use/.env.example | 2 + .../moondream-groq-computer-use/README.md | 64 ++ .../moondream-groq-computer-use/_gitignore | 39 + .../moondream-groq-computer-use/index.ts | 144 +++ .../moondream-groq-computer-use/llm_loop.ts | 797 ++++++++++++++++ .../moondream-groq-computer-use/loop.ts | 342 +++++++ .../moondream-groq-computer-use/moondream.ts | 135 +++ .../package-lock.json | 496 ++++++++++ .../moondream-groq-computer-use/package.json | 16 + .../moondream-groq-computer-use/session.ts | 199 ++++ .../tools/computer.ts | 311 ++++++ .../tools/types/computer.ts | 68 ++ .../moondream-groq-computer-use/tsconfig.json | 23 + 26 files changed, 4758 insertions(+) create mode 100644 pkg/templates/python/moondream-groq-computer-use/.env.example create mode 100644 pkg/templates/python/moondream-groq-computer-use/README.md create mode 100644 pkg/templates/python/moondream-groq-computer-use/_gitignore create mode 100644 pkg/templates/python/moondream-groq-computer-use/llm_loop.py create mode 100644 pkg/templates/python/moondream-groq-computer-use/loop.py create mode 100644 pkg/templates/python/moondream-groq-computer-use/main.py create mode 100644 
pkg/templates/python/moondream-groq-computer-use/moondream.py create mode 100644 pkg/templates/python/moondream-groq-computer-use/pyproject.toml create mode 100644 pkg/templates/python/moondream-groq-computer-use/session.py create mode 100644 pkg/templates/python/moondream-groq-computer-use/tools/__init__.py create mode 100644 pkg/templates/python/moondream-groq-computer-use/tools/computer.py create mode 100644 pkg/templates/python/moondream-groq-computer-use/tools/types.py create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/.env.example create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/README.md create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/_gitignore create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/index.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/llm_loop.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/loop.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/moondream.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/package-lock.json create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/package.json create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/session.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/tools/computer.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/tools/types/computer.ts create mode 100644 pkg/templates/typescript/moondream-groq-computer-use/tsconfig.json diff --git a/README.md b/README.md index 398bb45..53ddc5a 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,7 @@ Commands with JSON output support: - `anthropic-computer-use` - Anthropic Computer Use prompt loop - `openai-computer-use` - OpenAI Computer Use Agent sample - `gemini-computer-use` - Implements a Gemini computer use agent (TypeScript only) + - `moondream-groq-computer-use` - Moondream + Groq 
computer use agent (TypeScript + Python) - `openagi-computer-use` - OpenAGI Lux computer-use models (Python only) - `magnitude` - Magnitude framework sample (TypeScript only) - `claude-agent-sdk` - Claude Agent SDK browser automation agent @@ -517,6 +518,9 @@ kernel create --name my-agent --language ts --template stagehand # Create a Python Computer Use app kernel create --name my-cu-app --language py --template anthropic-computer-use +# Create a Moondream + Groq Computer Use app (TypeScript or Python) +kernel create --name my-moondream-cu --language ts --template moondream-groq-computer-use + # Create a Claude Agent SDK app (TypeScript or Python) kernel create --name my-claude-agent --language ts --template claude-agent-sdk ``` diff --git a/pkg/templates/python/moondream-groq-computer-use/.env.example b/pkg/templates/python/moondream-groq-computer-use/.env.example new file mode 100644 index 0000000..880a1af --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/.env.example @@ -0,0 +1,2 @@ +MOONDREAM_API_KEY= +GROQ_API_KEY= diff --git a/pkg/templates/python/moondream-groq-computer-use/README.md b/pkg/templates/python/moondream-groq-computer-use/README.md new file mode 100644 index 0000000..6542dd5 --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/README.md @@ -0,0 +1,64 @@ +# Kernel Python Sample App - Moondream Computer Use + +This Kernel app runs a lightweight computer-use agent powered by Moondream vision models and fast Groq LLM orchestration. + +## Setup + +1. Get your API keys: + - **Moondream**: [moondream.ai](https://moondream.ai) + - **Groq**: [console.groq.com](https://console.groq.com) + +2. 
Deploy the app: +```bash +kernel login +cp .env.example .env # Add your MOONDREAM_API_KEY and GROQ_API_KEY +kernel deploy main.py --env-file .env +``` + +## Usage + +Natural-language query (Groq LLM orchestrates Moondream + Kernel): +```bash +kernel invoke python-moondream-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +Structured steps (optional fallback for deterministic automation): +```bash +kernel invoke python-moondream-cua cua-task --payload '{ + "steps": [ + {"action": "navigate", "url": "https://example.com"}, + {"action": "caption"}, + {"action": "click", "target": "More information link", "retries": 4}, + {"action": "type", "target": "Search input", "text": "kernel", "press_enter": true} + ] +}' +``` + +## Step Actions + +Each step is a JSON object with an `action` field. Supported actions: + +- `navigate`: `{ "url": "https://..." }` +- `click`: `{ "target": "Button label or description" }` +- `type`: `{ "target": "Input field description", "text": "...", "press_enter": false }` +- `scroll`: `{ "direction": "down" }` or `{ "x": 0.5, "y": 0.5, "direction": "down" }` +- `query`: `{ "question": "Is there a login button?" }` +- `caption`: `{ "length": "short" | "normal" | "long" }` +- `wait`: `{ "seconds": 2.5 }` +- `key`: `{ "keys": "ctrl+l" }` +- `go_back`, `go_forward`, `search`, `open_web_browser` + +Optional step fields: +- `retries`: override retry attempts for point/click/type +- `retry_delay_ms`: wait between retries +- `x`, `y`: normalized (0-1) or pixel coordinates to bypass Moondream pointing (pixel coords use detected screenshot size) + +## Replay Recording + +Add `"record_replay": true` to the payload to capture a video replay (paid Kernel plans only). + +## Notes + +- The agent uses Moondream for visual reasoning and pointing. +- Kernel screenshots are PNG; Moondream queries are sent as base64 data URLs. 
+- The Groq LLM must output JSON actions; the agent repairs and parses JSON with json-repair. diff --git a/pkg/templates/python/moondream-groq-computer-use/_gitignore b/pkg/templates/python/moondream-groq-computer-use/_gitignore new file mode 100644 index 0000000..deac005 --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/_gitignore @@ -0,0 +1,6 @@ +.venv/ +__pycache__/ +*.pyc +.env +.env.local +uv.lock diff --git a/pkg/templates/python/moondream-groq-computer-use/llm_loop.py b/pkg/templates/python/moondream-groq-computer-use/llm_loop.py new file mode 100644 index 0000000..38fd1d9 --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/llm_loop.py @@ -0,0 +1,899 @@ +""" +Groq-driven Moondream + Kernel agent loop. +""" + +from __future__ import annotations + +import asyncio +import json +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +from groq import Groq +from json_repair import repair_json + +from moondream import MoondreamClient +from tools import ( + COORDINATE_SCALE, + ComputerAction, + ComputerTool, + DEFAULT_SCREEN_SIZE, + ScreenSize, +) + + +MODEL_NAME = "openai/gpt-oss-120b" + + +SYSTEM_PROMPT = """You are a browser-automation controller. You do NOT see images. +You must decide actions and call Moondream for any visual understanding. +Return ONLY a single JSON object that matches the schema below. +Parsing note: the client will extract the substring between the first '{' and last '}' and run jsonrepair on it. +Therefore, do NOT include any extra text before or after the JSON object. + +Browser context: +- The browser is already open. Do NOT request an open_browser action. + +Action policy: +- Bundle multiple actions when you can (e.g., navigate -> moondream_query). +- Use moondream_* actions for all visual understanding; keep queries short and specific. +- Never emit moondream_query without a clear question. 
+- Use click_at/type_text_at/scroll_at with coordinates in 0-1000 normalized scale. +- If you need coordinates, call moondream_point first. +- Prefer type_text_at with press_enter=true to submit searches; use key_combination mainly for shortcuts. +- You may include post_wait_ms in args to wait after an action (agent handles it). +- If the task requires a URL or page identity, call page_info after the relevant navigation/click. +- If your actions did not change state, reassess with a new Moondream question rather than repeating. +- If you need a specific item URL/details, open a specific item page (not a results list) and confirm it. +- If a click does not change the page, try a different target or use hover_at to reveal link text/URL. +- When opening an item, prefer clicking the title or image; verify you reached a detail page before returning its URL. +- If list items offer separate “comments/discussion” links and “title/article” links, click the title/article link unless the task explicitly asks for comments. +- On list pages with metadata/source links, click the title line (main link), not the source/domain/metadata line. +- If the task includes constraints, use on-screen evidence to select a qualifying item before answering. +- On list pages, identify a candidate item that matches constraints, then point to its title/image and click to open. +- Do not answer until you can confirm you are on the target page type (e.g., a single-item detail page). +- For “first/top result” tasks, click the topmost result item (not navigation, ads, or comments). +- When returning a URL, use the most recent page_info URL from the current page. +- Before final response for item-specific tasks, confirm the page type with moondream_query. +- If a click doesn't open the item, try a different target or a double-click by setting clicks: 2. If you suspect a new tab opened, use key_combination with ctrl+tab and re-check page_info. 
+- Use action result field state_changed to decide if a click/scroll had an effect; if false, adjust target or strategy. +- If the user specifies a site to search (e.g., Wikipedia), use that site's search first; only switch to another search engine if the site search fails. +- Never output placeholders like {{x}}, {{url}}, or in actions or final_response. +- Do not ask Moondream to infer the URL or page title; use page_info for those. +- If the task specifies a domain/URL, avoid leaving that domain unless the task explicitly requires it; if page_info shows an unexpected domain, go_back or navigate to the intended domain. +- If the task specifies a domain, your final_response URL must include that domain. +- After typing a search query, submit it (press_enter or search button). Avoid clicking unrelated suggestions or ads. +- For tasks like “first/top result,” ask Moondream to point at the first item or top result and click it. +- When moondream_point returns coordinates (x_norm/y_norm), use those exact numbers in click_at (x,y). Never use placeholders. +- Do not navigate to URLs derived from Moondream answers. Only navigate to URLs provided by the user or confirmed via page_info. +- If search results are not found after a couple of attempts, fallback to direct navigation to the most likely official page. +- Moondream query quality matters. Ask short, concrete, visual questions. Avoid vague or multi-part questions. +- When the task requires price or currency, verify the price on the detail page with a targeted Moondream query and return the exact text. +- For dense result grids, you may use moondream_detect with objects like "product image" or "item card" and click the topmost box. +- Never ask Moondream for a URL or link; only use page_info for URLs. + +Moondream query examples (good vs bad): +GOOD: "Is there a search box on this page?" +BAD: "What should I do next?" +GOOD: "What is the exact price shown for the highlighted item?" 
+BAD: "Tell me everything about this page." +GOOD: "Is this a single-item detail page?" +BAD: "Is this page good?" +GOOD: "Which button says 'Sign in'?" +BAD: "Find the right thing." +BAD: "What is the URL for this page?" + +Moondream query templates: +- Presence: "Is there a on the page?" +- Identification: "What is the exact text of the ?" +- Page type: "Is this a page?" +- Verification: "Does the page show the item I just clicked?" +- Result matching: "Which result shows the domain ?" +- If asked to use a search box, attempt a search interaction before using direct navigation; only fall back if stuck, and mention fallback in final_response. +- If the user requests JSON output, ensure final_response is valid JSON that matches the requested fields. +- When setting done=true, always include a non-empty final_response with concrete values (no placeholders like {{...}}). +- Stop when the task is complete by setting done=true and final_response. + +JSON Schema: +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "actions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": [ + "navigate", + "click_at", + "hover_at", + "type_text_at", + "scroll_document", + "scroll_at", + "go_back", + "go_forward", + "key_combination", + "drag_and_drop", + "wait", + "moondream_query", + "moondream_caption", + "moondream_point", + "moondream_detect", + "page_info", + "done", + "fail" + ] + }, + "args": { "type": "object" } + }, + "required": ["action", "args"], + "additionalProperties": false + } + }, + "done": { "type": "boolean" }, + "final_response": { "type": "string" }, + "error": { "type": "string" } + }, + "required": ["actions"], + "additionalProperties": false +} + +Examples (valid JSON): +{"actions":[{"action":"navigate","args":{"url":"https://example.com"}},{"action":"moondream_caption","args":{"length":"short"}}]} 
+{"actions":[{"action":"moondream_point","args":{"object":"login button"}},{"action":"click_at","args":{"x":512,"y":412}}]} +{"actions":[],"done":true,"final_response":"Logged in and reached the dashboard."} +{"actions":[],"done":true,"final_response":"{\"title\":\"Example Domain\",\"url\":\"https://example.com\"}"} +""" + + +@dataclass +class LlmOptions: + max_iterations: int = 40 + temperature: float = 1.0 + max_completion_tokens: int = 65536 + top_p: float = 1 + post_action_wait_ms: int = 500 + reasoning_effort: str = "medium" + + +@dataclass +class StepLog: + step: int + action: str + status: str + detail: str + output: Optional[str] = None + + +async def run_llm_agent( + *, + query: str, + moondream: MoondreamClient, + kernel_tool: ComputerTool, + groq_api_key: str, + options: LlmOptions, +) -> Dict[str, Any]: + groq = Groq(api_key=groq_api_key) + + messages: List[Dict[str, str]] = [ + {"role": "system", "content": SYSTEM_PROMPT}, + { + "role": "user", + "content": ( + "Task: " + + query + + "\nReturn a JSON object with an actions array. " + + "Bundle multiple actions when sensible." + ), + }, + ] + + logs: List[StepLog] = [] + answers: List[str] = [] + last_screenshot: Optional[str] = None + last_page_url: Optional[str] = None + last_point_norm: Optional[Tuple[int, int]] = None + error: Optional[str] = None + + for iteration in range(1, options.max_iterations + 1): + try: + raw = await asyncio.to_thread( + _groq_completion, + groq, + messages, + options, + ) + except Exception as exc: + messages.append( + { + "role": "user", + "content": "Your last output was invalid. 
Return ONLY a JSON object that matches the schema.", + } + ) + try: + raw = await asyncio.to_thread( + _groq_completion, + groq, + messages, + options, + ) + except Exception as exc2: + error = str(exc2) + raw = '{"actions":[]}' + + batch_payload = _parse_json_action(raw) + messages.append({"role": "assistant", "content": json.dumps(batch_payload)}) + + actions = _normalize_actions(batch_payload) + results: List[Dict[str, Any]] = [] + done_flag = bool(batch_payload.get("done")) + final_response = str(batch_payload.get("final_response", "")) if done_flag else "" + + try: + for action_item in actions: + action = str(action_item.get("action", "")).strip() + args = action_item.get("args") or {} + if not action: + results.append({"action": "", "status": "failed", "detail": "missing action"}) + continue + + if action == "navigate": + url = str(args.get("url", "")).strip() + if not url: + raise ValueError("navigate requires url") + if "{{" in url or "}}" in url or "placeholder" in url.lower(): + logs.append(StepLog(iteration, action, "failed", "navigate url is placeholder")) + results.append( + {"action": action, "status": "failed", "detail": "navigate url is placeholder"} + ) + continue + result = await kernel_tool.execute_action(ComputerAction.NAVIGATE, {"url": url}) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), f"Navigated to {url}")) + results.append( + { + "action": action, + "status": _status(result), + "detail": f"navigated to {url}", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "click_at": + try: + x, y = _coerce_coords(args, kernel_tool.screen_size) + except Exception as exc: + if last_point_norm: + x, y = last_point_norm + results.append( + { + "action": action, + "status": "success", + "detail": "used last moondream_point", + "used_last_point": True, + } + ) + else: + logs.append(StepLog(iteration, 
action, "failed", str(exc))) + results.append({"action": action, "status": "failed", "detail": str(exc)}) + continue + result = await kernel_tool.execute_action( + ComputerAction.CLICK_AT, + {"x": x, "y": y}, + ) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), "Clicked at coordinates")) + results.append( + { + "action": action, + "status": _status(result), + "detail": "clicked", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "hover_at": + try: + x, y = _coerce_coords(args, kernel_tool.screen_size) + except Exception as exc: + logs.append(StepLog(iteration, action, "failed", str(exc))) + results.append({"action": action, "status": "failed", "detail": str(exc)}) + continue + result = await kernel_tool.execute_action( + ComputerAction.HOVER_AT, + {"x": x, "y": y}, + ) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), "Hovered at coordinates")) + results.append( + { + "action": action, + "status": _status(result), + "detail": "hovered", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "type_text_at": + try: + x, y = _coerce_coords(args, kernel_tool.screen_size) + except Exception as exc: + if last_point_norm: + x, y = last_point_norm + results.append( + { + "action": action, + "status": "success", + "detail": "used last moondream_point", + "used_last_point": True, + } + ) + else: + logs.append(StepLog(iteration, action, "failed", str(exc))) + results.append({"action": action, "status": "failed", "detail": str(exc)}) + continue + text = args.get("text") + if text is None: + raise ValueError("type_text_at requires text") + payload = { + "x": x, + "y": y, + "text": str(text), + "press_enter": bool(args.get("press_enter", False)), + "clear_before_typing": 
bool(args.get("clear_before_typing", True)), + } + result = await kernel_tool.execute_action(ComputerAction.TYPE_TEXT_AT, payload) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), "Typed text")) + results.append( + { + "action": action, + "status": _status(result), + "detail": "typed", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "scroll_document": + direction = str(args.get("direction", "down")) + payload: Dict[str, Any] = {"direction": direction} + if args.get("magnitude") is not None: + payload["magnitude"] = int(args["magnitude"]) + result = await kernel_tool.execute_action(ComputerAction.SCROLL_DOCUMENT, payload) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), f"Scrolled {direction}")) + results.append( + { + "action": action, + "status": _status(result), + "detail": f"scrolled {direction}", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "scroll_at": + try: + x, y = _coerce_coords(args, kernel_tool.screen_size) + except Exception as exc: + logs.append(StepLog(iteration, action, "failed", str(exc))) + results.append({"action": action, "status": "failed", "detail": str(exc)}) + continue + direction = str(args.get("direction", "down")) + payload = {"x": x, "y": y, "direction": direction} + if args.get("magnitude") is not None: + payload["magnitude"] = int(args["magnitude"]) + result = await kernel_tool.execute_action(ComputerAction.SCROLL_AT, payload) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), f"Scrolled {direction}")) + results.append( + { + "action": action, + "status": _status(result), + "detail": f"scrolled {direction}", + 
"state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "go_back": + result = await kernel_tool.execute_action(ComputerAction.GO_BACK, {}) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), "Went back")) + results.append( + { + "action": action, + "status": _status(result), + "detail": "went back", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "go_forward": + result = await kernel_tool.execute_action(ComputerAction.GO_FORWARD, {}) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), "Went forward")) + results.append( + { + "action": action, + "status": _status(result), + "detail": "went forward", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "key_combination": + keys = str(args.get("keys", "")).strip() + if not keys: + raise ValueError("key_combination requires keys") + result = await kernel_tool.execute_action(ComputerAction.KEY_COMBINATION, {"keys": keys}) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), f"Pressed {keys}")) + results.append( + { + "action": action, + "status": _status(result), + "detail": f"pressed {keys}", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "wait": + seconds = float(args.get("seconds", 1)) + await asyncio.sleep(seconds) + logs.append(StepLog(iteration, action, "success", f"Waited {seconds:.2f}s")) + results.append({"action": action, "status": "success", "detail": f"waited {seconds:.2f}s"}) + + elif action == "moondream_query": + question = str(args.get("question", "")).strip() + if not question: + 
logs.append(StepLog(iteration, action, "failed", "Missing question")) + results.append( + {"action": action, "status": "failed", "detail": "missing question"} + ) + continue + screenshot = await _ensure_screenshot(kernel_tool, last_screenshot) + last_screenshot = screenshot + answer = await moondream.query(screenshot, question) + answers.append(answer) + logs.append(StepLog(iteration, action, "success", "Answered question", output=answer)) + results.append({"action": action, "status": "success", "answer": answer}) + + elif action == "moondream_caption": + length = str(args.get("length", "normal")) + screenshot = await _ensure_screenshot(kernel_tool, last_screenshot) + last_screenshot = screenshot + caption = await moondream.caption(screenshot, length) + answers.append(caption) + logs.append(StepLog(iteration, action, "success", "Captioned image", output=caption)) + results.append({"action": action, "status": "success", "caption": caption}) + + elif action == "drag_and_drop": + if "x" not in args or "y" not in args: + raise ValueError("drag_and_drop requires x and y") + if "destination_x" not in args or "destination_y" not in args: + raise ValueError("drag_and_drop requires destination_x and destination_y") + try: + start_x, start_y = _coerce_coords( + {"x": args.get("x"), "y": args.get("y")}, + kernel_tool.screen_size, + ) + end_x, end_y = _coerce_coords( + {"x": args.get("destination_x"), "y": args.get("destination_y")}, + kernel_tool.screen_size, + ) + except Exception as exc: + logs.append(StepLog(iteration, action, "failed", str(exc))) + results.append({"action": action, "status": "failed", "detail": str(exc)}) + continue + result = await kernel_tool.execute_action( + ComputerAction.DRAG_AND_DROP, + { + "x": start_x, + "y": start_y, + "destination_x": end_x, + "destination_y": end_y, + }, + ) + last_screenshot, state_changed = _update_screenshot_with_state( + result, last_screenshot + ) + logs.append(StepLog(iteration, action, _status(result), "Dragged 
element")) + results.append( + { + "action": action, + "status": _status(result), + "detail": "dragged", + "state_changed": state_changed, + } + ) + await _post_wait(action, args, options) + + elif action == "moondream_point": + obj = str(args.get("object", "")).strip() + if not obj: + raise ValueError("moondream_point requires object") + screenshot = await _ensure_screenshot(kernel_tool, last_screenshot) + last_screenshot = screenshot + point = await moondream.point(screenshot, obj) + if not point: + logs.append(StepLog(iteration, action, "failed", "No point found")) + results.append({"action": action, "status": "failed", "detail": "no point found"}) + else: + screen = kernel_tool.screen_size + payload = _point_payload(point.x, point.y, screen) + last_point_norm = (payload["x_norm"], payload["y_norm"]) + logs.append(StepLog(iteration, action, "success", "Point found", output=str(payload))) + results.append({"action": action, "status": "success", **payload}) + + elif action == "moondream_detect": + obj = str(args.get("object", "")).strip() + if not obj: + raise ValueError("moondream_detect requires object") + screenshot = await _ensure_screenshot(kernel_tool, last_screenshot) + last_screenshot = screenshot + detections = await moondream.detect(screenshot, obj) + payload = _detect_payload(detections, kernel_tool.screen_size) + logs.append(StepLog(iteration, action, "success", "Detection results", output=str(payload))) + results.append({"action": action, "status": "success", **payload}) + + elif action == "page_info": + payload = await _page_info(kernel_tool) + url_value = payload.get("url") if isinstance(payload, dict) else None + state_changed = bool(url_value and url_value != last_page_url) + if url_value: + last_page_url = str(url_value) + payload["state_changed"] = state_changed + status_value = "failed" if payload.get("error") else "success" + logs.append( + StepLog(iteration, action, status_value, "Page info", output=str(payload)) + ) + 
results.append({"action": action, "status": status_value, **payload}) + + elif action == "done": + done_flag = True + final_response = str(args.get("final_response", "")) + break + + elif action == "fail": + error = str(args.get("error", "unknown error")) + logs.append(StepLog(iteration, action, "failed", error)) + results.append({"action": action, "status": "failed", "detail": error}) + done_flag = True + break + + else: + raise ValueError(f"Unknown action: {action}") + + except Exception as exc: + message = str(exc) + logs.append(StepLog(iteration, "batch", "failed", message)) + error = message + results.append({"action": "batch", "status": "failed", "detail": message}) + + _append_result(messages, "batch", {"results": results}) + + if done_flag and ( + not final_response + or "{{" in final_response + or "}}" in final_response + or "placeholder" in final_response.lower() + ): + messages.append( + { + "role": "user", + "content": ( + "final_response must be non-empty and use concrete values (no placeholders). " + "Return a corrected JSON object." + ), + } + ) + done_flag = False + final_response = "" + + if done_flag: + stripped = final_response.strip() + if stripped.startswith("{"): + try: + repaired = repair_json(stripped) + parsed = json.loads(repaired) + if not isinstance(parsed, dict): + raise ValueError("final_response JSON is not an object") + except Exception: + messages.append( + { + "role": "user", + "content": ( + "final_response looks like JSON but is invalid. " + "Return a valid JSON object string." + ), + } + ) + done_flag = False + final_response = "" + + if done_flag: + urls = _extract_urls(final_response) + if urls and not last_page_url: + messages.append( + { + "role": "user", + "content": ( + "You returned a URL but did not call page_info. " + "Call page_info on the current page before final_response." 
+ ), + } + ) + done_flag = False + final_response = "" + elif urls and last_page_url and any(url != last_page_url for url in urls): + messages.append( + { + "role": "user", + "content": ( + "The returned URL does not match the current page_info URL. " + "Navigate to the correct page and then return that URL." + ), + } + ) + done_flag = False + final_response = "" + + if done_flag: + summary = f"Completed {sum(1 for log in logs if log.status == 'success')}/{len(logs)} steps" + result_payload = { + "summary": summary, + "final_response": final_response, + "steps": [log.__dict__ for log in logs], + "answers": answers, + } + return {"final_response": json.dumps(result_payload, indent=2), "error": error} + + summary = f"Completed {sum(1 for log in logs if log.status == 'success')}/{len(logs)} steps" + result_payload = { + "summary": summary, + "final_response": "", + "steps": [log.__dict__ for log in logs], + "answers": answers, + } + + return {"final_response": json.dumps(result_payload, indent=2), "error": error} + + +def _groq_completion(groq: Groq, messages: List[Dict[str, str]], options: LlmOptions) -> str: + completion = groq.chat.completions.create( + model=MODEL_NAME, + messages=messages, + temperature=options.temperature, + max_completion_tokens=options.max_completion_tokens, + top_p=options.top_p, + reasoning_effort=options.reasoning_effort, + stream=False, + response_format={"type": "json_object"}, + ) + return completion.choices[0].message.content or "" + + +def _parse_json_action(raw: str) -> Dict[str, Any]: + start = raw.find("{") + end = raw.rfind("}") + if start == -1 or end == -1 or end <= start: + raise ValueError("No JSON object found in LLM response") + snippet = raw[start : end + 1] + repaired = repair_json(snippet) + data = json.loads(repaired) + if not isinstance(data, dict): + raise ValueError("LLM JSON did not produce an object") + return data + + +def _extract_urls(final_response: str) -> List[str]: + text = final_response.strip() + if not 
(text.startswith("{") and text.endswith("}")): + return [] + try: + repaired = repair_json(text) + data = json.loads(repaired) + except Exception: + return [] + if not isinstance(data, dict): + return [] + urls: List[str] = [] + for key, value in data.items(): + if "url" in str(key).lower() and isinstance(value, str): + urls.append(value) + return urls + + +def _normalize_actions(payload: Dict[str, Any]) -> List[Dict[str, Any]]: + if "actions" in payload and isinstance(payload["actions"], list): + return [item for item in payload["actions"] if isinstance(item, dict)] + action = payload.get("action") + args = payload.get("args") if isinstance(payload.get("args"), dict) else {} + if action: + return [{"action": action, "args": args}] + return [] + + +def _append_result(messages: List[Dict[str, str]], action: str, payload: Any) -> None: + messages.append( + { + "role": "user", + "content": json.dumps({"type": "action_result", "action": action, "output": payload}), + } + ) + + +def _status(result: Any) -> str: + return "failed" if result and getattr(result, "error", None) else "success" + + +def _update_screenshot(result: Any, last_screenshot: Optional[str]) -> Optional[str]: + if result and getattr(result, "base64_image", None): + return result.base64_image + return last_screenshot + + +def _update_screenshot_with_state( + result: Any, last_screenshot: Optional[str] +) -> Tuple[Optional[str], bool]: + new_screenshot = _update_screenshot(result, last_screenshot) + if new_screenshot is None: + return last_screenshot, False + if last_screenshot is None: + return new_screenshot, True + return new_screenshot, new_screenshot != last_screenshot + + +def _coerce_coords(args: Dict[str, Any], screen_size: ScreenSize) -> Tuple[int, int]: + if "x" not in args or "y" not in args: + raise ValueError("x and y are required") + if isinstance(args.get("x"), str) and ("{" in args["x"] or "}" in args["x"]): + raise ValueError("x must be a number, not a placeholder") + if 
isinstance(args.get("y"), str) and ("{" in args["y"] or "}" in args["y"]): + raise ValueError("y must be a number, not a placeholder") + x = float(args["x"]) + y = float(args["y"]) + if 0 <= x <= 1 and 0 <= y <= 1: + return int(x * COORDINATE_SCALE), int(y * COORDINATE_SCALE) + if 0 <= x <= COORDINATE_SCALE and 0 <= y <= COORDINATE_SCALE: + return int(x), int(y) + width = screen_size.width if screen_size else DEFAULT_SCREEN_SIZE.width + height = screen_size.height if screen_size else DEFAULT_SCREEN_SIZE.height + return int((x / width) * COORDINATE_SCALE), int((y / height) * COORDINATE_SCALE) + + +async def _ensure_screenshot(computer: ComputerTool, last_screenshot: Optional[str]) -> str: + if last_screenshot: + return last_screenshot + result = await computer.screenshot() + if result.error or not result.base64_image: + raise RuntimeError(result.error or "Failed to capture screenshot") + return result.base64_image + + +async def _post_wait(action: str, args: Dict[str, Any], options: LlmOptions) -> None: + wait_actions = { + "navigate", + "click_at", + "hover_at", + "type_text_at", + "scroll_document", + "scroll_at", + "go_back", + "go_forward", + "key_combination", + "drag_and_drop", + } + if action not in wait_actions: + return + override = args.get("post_wait_ms") + if isinstance(override, (int, float)): + wait_ms = int(override) + else: + wait_ms = int(options.post_action_wait_ms) + if wait_ms > 0: + await asyncio.sleep(wait_ms / 1000) + + +async def _page_info(kernel_tool: ComputerTool) -> Dict[str, Any]: + try: + from playwright.async_api import async_playwright + except Exception: + return {"error": "playwright not installed"} + + try: + browser_info = kernel_tool.kernel.browsers.retrieve(kernel_tool.session_id) + cdp_url = getattr(browser_info, "cdp_ws_url", None) + except Exception as exc: + return {"error": f"failed to retrieve cdp url: {exc}"} + + if not cdp_url: + return {"error": "cdp url not available"} + + try: + async with async_playwright() as p: + 
browser = await p.chromium.connect_over_cdp(cdp_url) + pages = [] + for context in browser.contexts: + pages.extend(context.pages) + if not pages: + page = await browser.new_page() + else: + page = pages[-1] + title = await page.title() + url = page.url + await browser.close() + return {"url": url, "title": title} + except Exception as exc: + return {"error": f"playwright error: {exc}"} + + +def _point_payload(x: float, y: float, screen: ScreenSize) -> Dict[str, Any]: + x_norm = int(x * COORDINATE_SCALE) + y_norm = int(y * COORDINATE_SCALE) + x_px = int(x * screen.width) + y_px = int(y * screen.height) + return { + "x": x, + "y": y, + "x_norm": x_norm, + "y_norm": y_norm, + "x_px": x_px, + "y_px": y_px, + "screen": {"width": screen.width, "height": screen.height}, + } + + +def _detect_payload(detections: List[Dict[str, float]], screen: ScreenSize) -> Dict[str, Any]: + converted: List[Dict[str, Any]] = [] + for det in detections: + x_min = float(det.get("x_min", 0)) + y_min = float(det.get("y_min", 0)) + x_max = float(det.get("x_max", 0)) + y_max = float(det.get("y_max", 0)) + converted.append( + { + "x_min": x_min, + "y_min": y_min, + "x_max": x_max, + "y_max": y_max, + "x_min_norm": int(x_min * COORDINATE_SCALE), + "y_min_norm": int(y_min * COORDINATE_SCALE), + "x_max_norm": int(x_max * COORDINATE_SCALE), + "y_max_norm": int(y_max * COORDINATE_SCALE), + "x_min_px": int(x_min * screen.width), + "y_min_px": int(y_min * screen.height), + "x_max_px": int(x_max * screen.width), + "y_max_px": int(y_max * screen.height), + } + ) + return {"objects": converted, "screen": {"width": screen.width, "height": screen.height}} diff --git a/pkg/templates/python/moondream-groq-computer-use/loop.py b/pkg/templates/python/moondream-groq-computer-use/loop.py new file mode 100644 index 0000000..4562cac --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/loop.py @@ -0,0 +1,329 @@ +""" +Moondream computer-use agent loop. 
async def run_agent(
    *,
    query: Optional[str],
    steps: Optional[List[Dict[str, Any]]],
    moondream: MoondreamClient,
    kernel: Kernel,
    session_id: str,
    options: AgentOptions,
) -> Dict[str, Any]:
    """Run a scripted sequence of browser steps and report what happened.

    Steps come from ``steps`` when given, otherwise they are derived from
    ``query`` via ``parse_steps``. Each step is executed in order; a failing
    step is logged and, unless ``options.strict`` is set, execution continues
    with the next one.

    Returns:
        A dict with ``final_response`` (a JSON string holding the summary, the
        per-step logs and the collected answers) and ``error`` (the message of
        the last failed step, or ``None``).

    Raises:
        ValueError: if neither ``steps`` nor ``query`` yields any step.
    """
    computer = ComputerTool(kernel, session_id)

    parsed_steps = steps or parse_steps(query or "")
    if not parsed_steps:
        raise ValueError("No steps could be derived from the query. Provide steps or a query.")

    logs: List[StepLog] = []
    answers: List[str] = []
    # Most recent screenshot, reused by vision steps to avoid re-capturing.
    # It is reset to None whenever time passes without a new capture, so the
    # next vision step looks at the current page rather than a stale frame.
    last_screenshot: Optional[str] = None
    error: Optional[str] = None

    for index, step in enumerate(parsed_steps, start=1):
        action = str(step.get("action", "")).strip().lower()
        if not action:
            logs.append(StepLog(index, "unknown", "failed", "Missing action"))
            if options.strict:
                error = "Missing action in step"
                break
            continue

        try:
            if step.get("pre_wait_ms"):
                await asyncio.sleep(float(step["pre_wait_ms"]) / 1000)
                # The page may have changed while we slept.
                last_screenshot = None

            if action in {"open_web_browser", "open"}:
                result = await computer.execute_action(ComputerAction.OPEN_WEB_BROWSER, {})
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), "Opened browser"))

            elif action == "navigate":
                # Fall back to the first URL mentioned in the free-text query.
                url = step.get("url") or _find_url(query or "")
                if not url:
                    raise ValueError("navigate requires url")
                result = await computer.execute_action(ComputerAction.NAVIGATE, {"url": url})
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), f"Navigated to {url}"))

            elif action == "go_back":
                result = await computer.execute_action(ComputerAction.GO_BACK, {})
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), "Went back"))

            elif action == "go_forward":
                result = await computer.execute_action(ComputerAction.GO_FORWARD, {})
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), "Went forward"))

            elif action == "search":
                result = await computer.execute_action(ComputerAction.SEARCH, {})
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), "Focused address bar"))

            elif action == "wait":
                seconds = float(step.get("seconds", 1))
                await asyncio.sleep(seconds)
                # An explicit wait exists to let the page change; whatever was
                # captured before it no longer describes the page.
                last_screenshot = None
                logs.append(StepLog(index, action, "success", f"Waited {seconds:.2f}s"))

            elif action == "key":
                keys = step.get("keys")
                if not keys:
                    raise ValueError("key action requires keys")
                result = await computer.execute_action(
                    ComputerAction.KEY_COMBINATION, {"keys": keys}
                )
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), f"Pressed {keys}"))

            elif action == "scroll":
                direction = step.get("direction", "down")
                magnitude = step.get("magnitude")
                if "x" in step and "y" in step:
                    # Scroll at a specific point.
                    x_norm, y_norm = normalize_point(
                        float(step["x"]),
                        float(step["y"]),
                        computer.screen_size,
                    )
                    args: Dict[str, Any] = {
                        "x": x_norm,
                        "y": y_norm,
                        "direction": direction,
                    }
                    if magnitude is not None:
                        args["magnitude"] = int(magnitude)
                    result = await computer.execute_action(ComputerAction.SCROLL_AT, args)
                else:
                    # No point given: scroll the document as a whole.
                    args = {"direction": direction}
                    if magnitude is not None:
                        args["magnitude"] = int(magnitude)
                    result = await computer.execute_action(ComputerAction.SCROLL_DOCUMENT, args)
                last_screenshot = _update_screenshot(result, last_screenshot)
                logs.append(StepLog(index, action, _status(result), f"Scrolled {direction}"))

            elif action in {"click", "type"}:
                target = step.get("target")
                retries = int(step.get("retries", options.max_retries))
                delay_ms = int(step.get("retry_delay_ms", options.retry_delay_ms))

                coords = await _resolve_target_coords(
                    step,
                    target,
                    moondream,
                    computer,
                    last_screenshot,
                    retries,
                    delay_ms,
                )

                if not coords:
                    raise ValueError(f"Unable to locate target: {target}")

                x_norm, y_norm = coords
                if action == "click":
                    result = await computer.execute_action(
                        ComputerAction.CLICK_AT,
                        {"x": x_norm, "y": y_norm},
                    )
                    last_screenshot = _update_screenshot(result, last_screenshot)
                    logs.append(StepLog(index, action, _status(result), f"Clicked {target}"))
                else:
                    text = step.get("text")
                    if text is None:
                        raise ValueError("type action requires text")
                    result = await computer.execute_action(
                        ComputerAction.TYPE_TEXT_AT,
                        {
                            "x": x_norm,
                            "y": y_norm,
                            "text": str(text),
                            "press_enter": bool(step.get("press_enter", False)),
                            "clear_before_typing": bool(step.get("clear_before_typing", True)),
                        },
                    )
                    last_screenshot = _update_screenshot(result, last_screenshot)
                    logs.append(
                        StepLog(index, action, _status(result), f"Typed into {target}")
                    )

            elif action == "query":
                question = step.get("question") or query
                if not question:
                    raise ValueError("query action requires question")
                screenshot = await _ensure_screenshot(computer, last_screenshot)
                last_screenshot = screenshot
                answer = await moondream.query(screenshot, str(question))
                answers.append(answer)
                logs.append(StepLog(index, action, "success", "Answered question", output=answer))

            elif action == "caption":
                length = step.get("length", "normal")
                screenshot = await _ensure_screenshot(computer, last_screenshot)
                last_screenshot = screenshot
                caption = await moondream.caption(screenshot, str(length))
                answers.append(caption)
                logs.append(StepLog(index, action, "success", "Generated caption", output=caption))

            else:
                raise ValueError(f"Unknown action: {action}")

        except Exception as exc:
            # Record the failure; only strict mode stops the whole run.
            message = str(exc)
            logs.append(StepLog(index, action, "failed", message))
            error = message
            if options.strict:
                break

    summary = f"Completed {sum(1 for log in logs if log.status == 'success')}/{len(logs)} steps"
    result_payload = {
        "summary": summary,
        "steps": [log.__dict__ for log in logs],
        "answers": answers,
    }

    return {
        "final_response": json.dumps(result_payload, indent=2),
        "error": error,
    }
except json.JSONDecodeError: + pass + + steps: List[Dict[str, Any]] = [] + url = _find_url(query) + if url: + steps.append({"action": "navigate", "url": url}) + + question = _strip_url_and_navigation(query) + wants_caption = any(term in query.lower() for term in ["describe", "caption"]) + + if wants_caption: + steps.append({"action": "caption"}) + elif question: + steps.append({"action": "query", "question": question}) + elif url: + steps.append({"action": "caption"}) + else: + steps.append({"action": "query", "question": query}) + + return steps + + +def _find_url(query: str) -> Optional[str]: + match = URL_RE.search(query) + return match.group(0) if match else None + + +def _strip_url_and_navigation(query: str) -> str: + cleaned = URL_RE.sub("", query) + cleaned = re.sub(r"\b(navigate|open|go|visit)\b", "", cleaned, flags=re.IGNORECASE) + cleaned = cleaned.replace("to", " ") + cleaned = re.sub(r"\s+", " ", cleaned).strip(" ,.;:-") + return cleaned + + +def normalize_point(x: float, y: float, screen_size: Optional[ScreenSize] = None) -> Tuple[int, int]: + if 0 <= x <= 1 and 0 <= y <= 1: + return int(x * COORDINATE_SCALE), int(y * COORDINATE_SCALE) + width = screen_size.width if screen_size else DEFAULT_SCREEN_SIZE.width + height = screen_size.height if screen_size else DEFAULT_SCREEN_SIZE.height + return int((x / width) * COORDINATE_SCALE), int((y / height) * COORDINATE_SCALE) + + +def _status(result: Any) -> str: + return "failed" if result and getattr(result, "error", None) else "success" + + +def _update_screenshot(result: Any, last_screenshot: Optional[str]) -> Optional[str]: + if result and getattr(result, "base64_image", None): + return result.base64_image + return last_screenshot + + +async def _ensure_screenshot(computer: ComputerTool, last_screenshot: Optional[str]) -> str: + if last_screenshot: + return last_screenshot + result = await computer.screenshot() + if result.error or not result.base64_image: + raise RuntimeError(result.error or "Failed to 
capture screenshot") + return result.base64_image + + +async def _resolve_target_coords( + step: Dict[str, Any], + target: Optional[str], + moondream: MoondreamClient, + computer: ComputerTool, + last_screenshot: Optional[str], + retries: int, + delay_ms: int, +) -> Optional[Tuple[int, int]]: + if "x" in step and "y" in step: + return normalize_point(float(step["x"]), float(step["y"]), computer.screen_size) + + if not target: + return None + + attempts = max(1, retries) + for attempt in range(attempts): + screenshot = await _ensure_screenshot(computer, last_screenshot) + point = await moondream.point(screenshot, str(target)) + if point: + return normalize_point(point.x, point.y, computer.screen_size) + if attempt < attempts - 1: + await asyncio.sleep(delay_ms / 1000) + last_screenshot = None + + return None diff --git a/pkg/templates/python/moondream-groq-computer-use/main.py b/pkg/templates/python/moondream-groq-computer-use/main.py new file mode 100644 index 0000000..6e807f0 --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/main.py @@ -0,0 +1,152 @@ +import os +from typing import List, Optional, TypedDict + +import kernel +from llm_loop import LlmOptions, run_llm_agent +from loop import AgentOptions, run_agent +from moondream import MoondreamClient +from session import KernelBrowserSession +from tools import ComputerTool + + +class StepInput(TypedDict, total=False): + action: str + url: str + target: str + text: str + question: str + direction: str + magnitude: int + x: float + y: float + keys: str + seconds: float + retries: int + retry_delay_ms: int + pre_wait_ms: int + press_enter: bool + clear_before_typing: bool + length: str + + +class QueryInput(TypedDict, total=False): + query: str + steps: List[StepInput] + record_replay: bool + max_retries: int + retry_delay_ms: int + strict: bool + max_iterations: int + post_action_wait_ms: int + + +class QueryOutput(TypedDict): + result: str + replay_url: Optional[str] + error: Optional[str] + + 
@app.action("cua-task")
async def cua_task(
    ctx: kernel.KernelContext,
    payload: QueryInput,
) -> QueryOutput:
    """Kernel action entry point for the Moondream + Groq computer-use agent.

    When the payload carries ``steps`` they are executed as a script via
    ``run_agent``; otherwise ``query`` is handed to the Groq-driven loop
    (``run_llm_agent``), which needs ``GROQ_API_KEY``.

    Returns:
        ``result`` (the agent's final response), ``replay_url`` (taken from the
        session after it has been closed) and ``error``.

    Raises:
        ValueError: if the payload has neither ``query`` nor ``steps``, or if
            the LLM loop is required but ``GROQ_API_KEY`` is not set.
    """
    if not payload or not (payload.get("query") or payload.get("steps")):
        raise ValueError(
            "Query is required. Payload must include: {\"query\": \"your task description\"}"
        )

    scripted_steps = payload.get("steps")
    # Validate configuration before provisioning a browser, so a missing key
    # does not create and tear down a session just to fail.
    if not scripted_steps and not groq_key:
        raise ValueError(
            "GROQ_API_KEY is not set. "
            "Set it via environment variable or deploy with: kernel deploy main.py --env-file .env"
        )

    record_replay = payload.get("record_replay", False)
    options = AgentOptions(
        max_retries=int(payload.get("max_retries", 3)),
        retry_delay_ms=int(payload.get("retry_delay_ms", 1000)),
        strict=bool(payload.get("strict", False)),
    )
    llm_options = LlmOptions(
        max_iterations=int(payload.get("max_iterations", 40)),
        post_action_wait_ms=int(payload.get("post_action_wait_ms", 500)),
    )

    async with KernelBrowserSession(
        stealth=True,
        record_replay=record_replay,
    ) as session:
        print("Kernel browser live view url:", session.live_view_url)

        async with MoondreamClient(api_key=str(api_key)) as moondream:
            if scripted_steps:
                result = await run_agent(
                    query=payload.get("query"),
                    steps=scripted_steps,
                    moondream=moondream,
                    kernel=session.kernel,
                    session_id=session.session_id,
                    options=options,
                )
            else:
                result = await run_llm_agent(
                    query=str(payload.get("query")),
                    moondream=moondream,
                    kernel_tool=ComputerTool(session.kernel, session.session_id),
                    groq_api_key=str(groq_key),
                    options=llm_options,
                )

    # Read the replay URL only after the session context has exited.
    return {
        "result": result.get("final_response", ""),
        "replay_url": session.replay_view_url,
        "error": result.get("error"),
    }
async def point(self, image_base64: str, obj: str) -> Optional[MoondreamPoint]:
    """Ask the ``/point`` endpoint where ``obj`` is in the image.

    Returns the first reported point, or ``None`` when the response has no
    usable ``points`` list or the first entry lacks numeric ``x``/``y``.
    """
    response = await self._post(
        "/point",
        {
            "image_url": _to_data_url(image_base64),
            "object": obj,
        },
    )
    candidates = response.get("points")
    if isinstance(candidates, list) and candidates:
        first = candidates[0]
        if isinstance(first, dict):
            coords = (first.get("x"), first.get("y"))
            if all(isinstance(value, (int, float)) for value in coords):
                return MoondreamPoint(x=float(coords[0]), y=float(coords[1]))
    return None
_to_data_url(image_base64), + "object": obj, + } + data = await self._post("/detect", payload) + objects = data.get("objects") + if not isinstance(objects, list): + return [] + results: list[dict[str, float]] = [] + for item in objects: + if not isinstance(item, dict): + continue + x_min = item.get("x_min") + y_min = item.get("y_min") + x_max = item.get("x_max") + y_max = item.get("y_max") + if all(isinstance(v, (int, float)) for v in (x_min, y_min, x_max, y_max)): + results.append( + { + "x_min": float(x_min), + "y_min": float(y_min), + "x_max": float(x_max), + "y_max": float(y_max), + } + ) + return results + + async def _post(self, path: str, payload: dict[str, Any]) -> dict[str, Any]: + response = await self._client.post(path, json=payload) + if response.status_code >= 400: + text = response.text + raise MoondreamError(f"Moondream API error {response.status_code}: {text}") + data = response.json() + if not isinstance(data, dict): + raise MoondreamError("Moondream API returned unexpected response type") + return data + + +def _to_data_url(image_base64: str) -> str: + # Kernel screenshots are PNG by default + return f"data:image/png;base64,{image_base64}" + + +def encode_image_bytes(image_bytes: bytes) -> str: + return base64.b64encode(image_bytes).decode("utf-8") diff --git a/pkg/templates/python/moondream-groq-computer-use/pyproject.toml b/pkg/templates/python/moondream-groq-computer-use/pyproject.toml new file mode 100644 index 0000000..a025f18 --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/pyproject.toml @@ -0,0 +1,16 @@ +[project] +name = "python-moondream-cua" +version = "0.1.0" +description = "Moondream computer use agent for Kernel" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "groq", + "httpx", + "json-repair", + "kernel", + "playwright", +] + +[tool.uv] +dev-dependencies = [] diff --git a/pkg/templates/python/moondream-groq-computer-use/session.py 
@dataclass
class KernelBrowserSession:
    """Async context manager around one Kernel browser.

    Entering creates the browser (and optionally starts a replay recording);
    exiting stops the recording, looks up its view URL and deletes the
    browser. Replay problems on either side are reported as warnings and never
    prevent the browser from being deleted.
    """

    stealth: bool = True
    timeout_seconds: int = 600

    # Replay recording options
    record_replay: bool = False
    replay_grace_period: float = 5.0  # Seconds to wait before stopping replay

    # Set after browser creation
    session_id: Optional[str] = field(default=None, init=False)
    live_view_url: Optional[str] = field(default=None, init=False)
    replay_id: Optional[str] = field(default=None, init=False)
    replay_view_url: Optional[str] = field(default=None, init=False)
    _kernel: Optional[Kernel] = field(default=None, init=False)

    async def __aenter__(self) -> "KernelBrowserSession":
        """Create the browser and, if requested, start recording a replay."""
        self._kernel = Kernel()

        # Create browser with specified settings
        browser = self._kernel.browsers.create(
            stealth=self.stealth,
            timeout_seconds=self.timeout_seconds,
            viewport={
                "width": DEFAULT_SCREEN_SIZE.width,
                "height": DEFAULT_SCREEN_SIZE.height,
            },
        )

        self.session_id = browser.session_id
        self.live_view_url = browser.browser_live_view_url

        print(f"Kernel browser created: {self.session_id}")
        print(f"Live view URL: {self.live_view_url}")

        # Start replay recording if enabled
        if self.record_replay:
            try:
                await self._start_replay()
            except Exception as e:
                print(f"Warning: Failed to start replay recording: {e}")
                print("Continuing without replay recording.")

        return self

    async def _start_replay(self) -> None:
        """Start a replay recording and remember its id."""
        if not self._kernel or not self.session_id:
            return

        print("Starting replay recording...")
        replay = self._kernel.browsers.replays.start(self.session_id)
        self.replay_id = replay.replay_id
        print(f"Replay recording started: {self.replay_id}")

    async def _stop_and_get_replay_url(self) -> None:
        """Stop the recording, then poll for up to 60 seconds for its view URL."""
        if not self._kernel or not self.session_id or not self.replay_id:
            return

        print("Stopping replay recording...")
        self._kernel.browsers.replays.stop(
            replay_id=self.replay_id,
            id=self.session_id,
        )
        print("Replay recording stopped. Processing video...")

        # Wait a moment for processing
        await asyncio.sleep(2)

        # Poll for replay to be ready (with timeout). A monotonic clock keeps
        # the timeout correct even if the system clock is adjusted meanwhile.
        max_wait = 60  # seconds
        deadline = time.monotonic() + max_wait
        replay_ready = False

        while time.monotonic() < deadline:
            try:
                replays = self._kernel.browsers.replays.list(self.session_id)
                for replay in replays:
                    if replay.replay_id == self.replay_id:
                        self.replay_view_url = replay.replay_view_url
                        replay_ready = True
                        break
                if replay_ready:
                    break
            except Exception:
                # A failed listing is treated as "not ready yet"; keep polling
                # until the deadline.
                pass
            await asyncio.sleep(1)

        if not replay_ready:
            print("Warning: Replay may still be processing")
        elif self.replay_view_url:
            print(f"Replay view URL: {self.replay_view_url}")

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Stop any recording and delete the browser.

        Errors while stopping the replay are printed as warnings so they can
        neither skip the browser deletion nor replace an exception raised by
        the ``async with`` body. An error from the deletion itself propagates.
        """
        try:
            if self._kernel and self.session_id:
                try:
                    # Stop replay if recording was enabled
                    if self.record_replay and self.replay_id:
                        try:
                            # Wait grace period before stopping to capture final state
                            if self.replay_grace_period > 0:
                                print(f"Waiting {self.replay_grace_period}s grace period...")
                                await asyncio.sleep(self.replay_grace_period)
                            await self._stop_and_get_replay_url()
                        except Exception as e:
                            print(f"Warning: Failed to stop replay recording: {e}")
                finally:
                    print(f"Destroying browser session: {self.session_id}")
                    self._kernel.browsers.delete_by_id(self.session_id)
                    print("Browser session destroyed.")
        finally:
            # Always drop the client reference, even when deletion failed.
            self._kernel = None

    @property
    def kernel(self) -> Kernel:
        """The Kernel client; only available inside the ``async with`` block."""
        if self._kernel is None:
            raise RuntimeError("Session not initialized. Use async with context.")
        return self._kernel
+""" + +import asyncio +import base64 + +from kernel import Kernel + +from .types import ( + ComputerAction, + ComputerFunctionArgs, + PREDEFINED_COMPUTER_USE_FUNCTIONS, + DEFAULT_SCREEN_SIZE, + COORDINATE_SCALE, + ToolResult, + ScreenSize, +) + + +TYPING_DELAY_MS = 12 +SCREENSHOT_DELAY_SECS = 0.5 + + +class ComputerTool: + def __init__( + self, + kernel: Kernel, + session_id: str, + screen_size: ScreenSize = DEFAULT_SCREEN_SIZE, + ): + self.kernel = kernel + self.session_id = session_id + self.screen_size = screen_size + + def denormalize_x(self, x: int) -> int: + return int((x / COORDINATE_SCALE) * self.screen_size.width) + + def denormalize_y(self, y: int) -> int: + return int((y / COORDINATE_SCALE) * self.screen_size.height) + + async def screenshot(self) -> ToolResult: + try: + await asyncio.sleep(SCREENSHOT_DELAY_SECS) + response = self.kernel.browsers.computer.capture_screenshot(self.session_id) + screenshot_bytes = response.read() + dimensions = _parse_png_dimensions(screenshot_bytes) + if dimensions: + self.screen_size = dimensions + + return ToolResult( + base64_image=base64.b64encode(screenshot_bytes).decode("utf-8"), + url="about:blank", + width=dimensions.width if dimensions else None, + height=dimensions.height if dimensions else None, + ) + except Exception as e: + return ToolResult(error=f"Failed to take screenshot: {e}", url="about:blank") + + async def execute_action( + self, action_name: str, args: ComputerFunctionArgs + ) -> ToolResult: + if action_name not in [a.value for a in PREDEFINED_COMPUTER_USE_FUNCTIONS]: + return ToolResult(error=f"Unknown action: {action_name}") + + try: + if action_name == ComputerAction.OPEN_WEB_BROWSER: + # Browser is already open in Kernel, just return screenshot + pass + + elif action_name == ComputerAction.CLICK_AT: + if "x" not in args or "y" not in args: + return ToolResult(error="click_at requires x and y coordinates") + x = self.denormalize_x(args["x"]) + y = self.denormalize_y(args["y"]) + num_clicks = 
int(args.get("clicks", 1)) if args.get("clicks") else 1 + self.kernel.browsers.computer.click_mouse( + self.session_id, + x=x, + y=y, + button="left", + click_type="click", + num_clicks=num_clicks, + ) + + elif action_name == ComputerAction.HOVER_AT: + if "x" not in args or "y" not in args: + return ToolResult(error="hover_at requires x and y coordinates") + x = self.denormalize_x(args["x"]) + y = self.denormalize_y(args["y"]) + self.kernel.browsers.computer.move_mouse( + self.session_id, x=x, y=y + ) + + elif action_name == ComputerAction.TYPE_TEXT_AT: + if "x" not in args or "y" not in args: + return ToolResult(error="type_text_at requires x and y coordinates") + if "text" not in args: + return ToolResult(error="type_text_at requires text") + + x = self.denormalize_x(args["x"]) + y = self.denormalize_y(args["y"]) + + self.kernel.browsers.computer.click_mouse( + self.session_id, + x=x, + y=y, + button="left", + click_type="click", + num_clicks=1, + ) + + if args.get("clear_before_typing", True): + self.kernel.browsers.computer.press_key( + self.session_id, keys=["ctrl+a"] + ) + await asyncio.sleep(0.05) + + self.kernel.browsers.computer.type_text( + self.session_id, + text=args["text"], + delay=TYPING_DELAY_MS, + ) + + if args.get("press_enter", False): + await asyncio.sleep(0.1) + self.kernel.browsers.computer.press_key( + self.session_id, keys=["Return"] + ) + + elif action_name == ComputerAction.SCROLL_DOCUMENT: + if "direction" not in args: + return ToolResult(error="scroll_document requires direction") + center_x = self.screen_size.width // 2 + center_y = self.screen_size.height // 2 + scroll_delta = 500 + + delta_x, delta_y = 0, 0 + direction = args["direction"] + if direction == "down": + delta_y = scroll_delta + elif direction == "up": + delta_y = -scroll_delta + elif direction == "right": + delta_x = scroll_delta + elif direction == "left": + delta_x = -scroll_delta + + self.kernel.browsers.computer.scroll( + self.session_id, + x=center_x, + y=center_y, + 
delta_x=delta_x, + delta_y=delta_y, + ) + + elif action_name == ComputerAction.SCROLL_AT: + if "x" not in args or "y" not in args: + return ToolResult(error="scroll_at requires x and y coordinates") + if "direction" not in args: + return ToolResult(error="scroll_at requires direction") + + x = self.denormalize_x(args["x"]) + y = self.denormalize_y(args["y"]) + + magnitude = args.get("magnitude", 800) + direction = args["direction"] + if direction in ("up", "down"): + magnitude = self.denormalize_y(magnitude) + else: + magnitude = self.denormalize_x(magnitude) + + delta_x, delta_y = 0, 0 + if direction == "down": + delta_y = magnitude + elif direction == "up": + delta_y = -magnitude + elif direction == "right": + delta_x = magnitude + elif direction == "left": + delta_x = -magnitude + + self.kernel.browsers.computer.scroll( + self.session_id, + x=x, + y=y, + delta_x=delta_x, + delta_y=delta_y, + ) + + elif action_name == ComputerAction.WAIT_5_SECONDS: + await asyncio.sleep(5) + + elif action_name == ComputerAction.GO_BACK: + self.kernel.browsers.computer.press_key( + self.session_id, keys=["alt+Left"] + ) + await asyncio.sleep(1) + + elif action_name == ComputerAction.GO_FORWARD: + self.kernel.browsers.computer.press_key( + self.session_id, keys=["alt+Right"] + ) + await asyncio.sleep(1) + + elif action_name == ComputerAction.SEARCH: + self.kernel.browsers.computer.press_key( + self.session_id, keys=["ctrl+l"] + ) + + elif action_name == ComputerAction.NAVIGATE: + if "url" not in args: + return ToolResult(error="navigate requires url") + self.kernel.browsers.computer.press_key( + self.session_id, keys=["ctrl+l"] + ) + await asyncio.sleep(0.1) + self.kernel.browsers.computer.type_text( + self.session_id, + text=args["url"], + delay=TYPING_DELAY_MS, + ) + await asyncio.sleep(0.1) + self.kernel.browsers.computer.press_key( + self.session_id, keys=["Return"] + ) + await asyncio.sleep(1.5) + + elif action_name == ComputerAction.KEY_COMBINATION: + if "keys" not in args: + 
return ToolResult(error="key_combination requires keys") + keys = str(args["keys"]) + if keys.lower() == "enter": + keys = "Return" + self.kernel.browsers.computer.press_key( + self.session_id, keys=[keys] + ) + + elif action_name == ComputerAction.DRAG_AND_DROP: + required = ["x", "y", "destination_x", "destination_y"] + if not all(k in args for k in required): + return ToolResult( + error="drag_and_drop requires x, y, destination_x, and destination_y" + ) + + start_x = self.denormalize_x(args["x"]) + start_y = self.denormalize_y(args["y"]) + end_x = self.denormalize_x(args["destination_x"]) + end_y = self.denormalize_y(args["destination_y"]) + + self.kernel.browsers.computer.drag_mouse( + self.session_id, + path=[[start_x, start_y], [end_x, end_y]], + button="left", + ) + + else: + return ToolResult(error=f"Unhandled action: {action_name}") + + await asyncio.sleep(SCREENSHOT_DELAY_SECS) + return await self.screenshot() + + except Exception as e: + return ToolResult(error=f"Action failed: {e}", url="about:blank") + + +def _parse_png_dimensions(data: bytes) -> ScreenSize | None: + if len(data) < 24: + return None + if data[:8] != b"\x89PNG\r\n\x1a\n": + return None + width = int.from_bytes(data[16:20], "big") + height = int.from_bytes(data[20:24], "big") + if width <= 0 or height <= 0: + return None + return ScreenSize(width=width, height=height) diff --git a/pkg/templates/python/moondream-groq-computer-use/tools/types.py b/pkg/templates/python/moondream-groq-computer-use/tools/types.py new file mode 100644 index 0000000..9b29a35 --- /dev/null +++ b/pkg/templates/python/moondream-groq-computer-use/tools/types.py @@ -0,0 +1,85 @@ +""" +Type definitions for computer use actions. 
+""" + +from dataclasses import dataclass +from enum import StrEnum +from typing import Literal, Optional, TypedDict + + +class ComputerAction(StrEnum): + OPEN_WEB_BROWSER = "open_web_browser" + CLICK_AT = "click_at" + HOVER_AT = "hover_at" + TYPE_TEXT_AT = "type_text_at" + SCROLL_DOCUMENT = "scroll_document" + SCROLL_AT = "scroll_at" + WAIT_5_SECONDS = "wait_5_seconds" + GO_BACK = "go_back" + GO_FORWARD = "go_forward" + SEARCH = "search" + NAVIGATE = "navigate" + KEY_COMBINATION = "key_combination" + DRAG_AND_DROP = "drag_and_drop" + + +# Derive from enum to prevent drift when adding new actions +PREDEFINED_COMPUTER_USE_FUNCTIONS = list(ComputerAction) + + +ScrollDirection = Literal["up", "down", "left", "right"] + + +class SafetyDecision(TypedDict, total=False): + decision: str + explanation: str + + +class ComputerFunctionArgs(TypedDict, total=False): + # click_at, hover_at, scroll_at + x: int + y: int + clicks: int + + # type_text_at + text: str + press_enter: bool + clear_before_typing: bool + + # scroll_document, scroll_at + direction: ScrollDirection + magnitude: int + + # navigate + url: str + + # key_combination + keys: str + + # drag_and_drop + destination_x: int + destination_y: int + + # Safety decision (may be included in any function call) + safety_decision: SafetyDecision + + +@dataclass +class ToolResult: + base64_image: Optional[str] = None + url: Optional[str] = None + error: Optional[str] = None + width: Optional[int] = None + height: Optional[int] = None + + +@dataclass +class ScreenSize: + width: int + height: int + + +DEFAULT_SCREEN_SIZE = ScreenSize(width=1200, height=800) + +# Normalized coordinates scale (0-1000) +COORDINATE_SCALE = 1000 diff --git a/pkg/templates/typescript/moondream-groq-computer-use/.env.example b/pkg/templates/typescript/moondream-groq-computer-use/.env.example new file mode 100644 index 0000000..880a1af --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/.env.example @@ -0,0 +1,2 @@ 
+MOONDREAM_API_KEY= +GROQ_API_KEY= diff --git a/pkg/templates/typescript/moondream-groq-computer-use/README.md b/pkg/templates/typescript/moondream-groq-computer-use/README.md new file mode 100644 index 0000000..f268002 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/README.md @@ -0,0 +1,64 @@ +# Kernel TypeScript Sample App - Moondream + Groq Computer Use + +This Kernel app runs a lightweight computer-use agent that combines Moondream vision models with fast Groq LLM orchestration. + +## Setup + +1. Get your API keys: + - **Moondream**: [moondream.ai](https://moondream.ai) + - **Groq**: [console.groq.com](https://console.groq.com) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your MOONDREAM_API_KEY and GROQ_API_KEY +kernel deploy index.ts --env-file .env +``` + +## Usage + +Natural-language query (Groq LLM orchestrates Moondream + Kernel): +```bash +kernel invoke ts-moondream-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +Structured steps (optional fallback for deterministic automation): +```bash +kernel invoke ts-moondream-cua cua-task --payload '{ + "steps": [ + {"action": "navigate", "url": "https://example.com"}, + {"action": "caption"}, + {"action": "click", "target": "More information link", "retries": 4}, + {"action": "type", "target": "Search input", "text": "kernel", "press_enter": true} + ] +}' +``` + +## Step Actions + +Each step is a JSON object with an `action` field. Supported actions: + +- `navigate`: `{ "url": "https://..." }` +- `click`: `{ "target": "Button label or description" }` +- `type`: `{ "target": "Input field description", "text": "...", "press_enter": false }` +- `scroll`: `{ "direction": "down" }` or `{ "x": 0.5, "y": 0.5, "direction": "down" }` +- `query`: `{ "question": "Is there a login button?" 
}` +- `caption`: `{ "length": "short" | "normal" | "long" }` +- `wait`: `{ "seconds": 2.5 }` +- `key`: `{ "keys": "ctrl+l" }` +- `go_back`, `go_forward`, `search`, `open_web_browser` + +Optional step fields: +- `retries`: override retry attempts for point/click/type +- `retry_delay_ms`: wait between retries +- `x`, `y`: normalized (0-1) or pixel coordinates to bypass Moondream pointing (pixel coords use detected screenshot size) + +## Replay Recording + +Add `"record_replay": true` to the payload to capture a video replay. + +## Notes + +- The agent uses Moondream for visual reasoning and pointing. +- Kernel screenshots are PNG; Moondream queries are sent as base64 data URLs. +- The Groq LLM orchestrates JSON actions; the agent repairs and parses JSON with jsonrepair. diff --git a/pkg/templates/typescript/moondream-groq-computer-use/_gitignore b/pkg/templates/typescript/moondream-groq-computer-use/_gitignore new file mode 100644 index 0000000..095f573 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/_gitignore @@ -0,0 +1,39 @@ +# Dependencies +node_modules/ +package-lock.json + +# TypeScript +*.tsbuildinfo +dist/ +build/ + +# Environment +.env +.env.local +.env.*.local + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Logs +logs/ +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Testing +coverage/ +.nyc_output/ + +# Misc +.cache/ +.temp/ +.tmp/ diff --git a/pkg/templates/typescript/moondream-groq-computer-use/index.ts b/pkg/templates/typescript/moondream-groq-computer-use/index.ts new file mode 100644 index 0000000..49bc85d --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/index.ts @@ -0,0 +1,144 @@ +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { runAgent, type AgentOptions, type StepInput } from './loop'; +import { runLlmAgent, type LlmOptions } from './llm_loop'; +import { KernelBrowserSession } from './session'; +import { MoondreamClient } from 
'./moondream'; // tail of the `MoondreamClient` import statement that begins on the preceding line

const kernel = new Kernel();

const app = kernel.app('ts-moondream-cua');

/** Payload accepted by the `cua-task` action. At least one of `query` or `steps` is required. */
interface QueryInput {
  /** Natural-language task handed to the Groq-orchestrated loop. */
  query?: string;
  /** Explicit step list; when non-empty the structured runner is used instead of the LLM loop. */
  steps?: StepInput[];
  /** Forwarded to the browser session as `recordReplay` (defaults to false). */
  record_replay?: boolean;
  /** Forwarded to the structured runner as `maxRetries`. */
  max_retries?: number;
  /** Forwarded to the structured runner as `retryDelayMs`. */
  retry_delay_ms?: number;
  /** Forwarded to the structured runner as `strict`. */
  strict?: boolean;
  /** Forwarded to the LLM loop as `maxIterations`. */
  max_iterations?: number;
  /** Forwarded to the LLM loop as `postActionWaitMs`. */
  post_action_wait_ms?: number;
}

/** Value returned by the `cua-task` action. */
interface QueryOutput {
  result: string;
  replay_url?: string;
  error?: string;
}

const MOONDREAM_API_KEY = process.env.MOONDREAM_API_KEY;
const GROQ_API_KEY = process.env.GROQ_API_KEY;

// Both execution paths need Moondream, so a missing key aborts at module load.
if (!MOONDREAM_API_KEY) {
  throw new Error(
    'MOONDREAM_API_KEY is not set. ' +
      'Set it via environment variable or deploy with: kernel deploy index.ts --env-file .env'
  );
}

app.action(
  'cua-task',
  /**
   * Runs one computer-use task in a fresh Kernel browser session.
   *
   * A non-empty `steps` array selects the structured runner; otherwise `query`
   * is handed to the Groq-driven loop. The session is stopped on every path.
   *
   * @throws Error when the payload has neither `query` nor `steps`, when the
   *   Groq key is needed but missing, or when the agent run itself throws.
   */
  async (ctx: KernelContext, payload?: QueryInput): Promise<QueryOutput> => {
    const hasSteps = Boolean(payload?.steps?.length);
    if (!payload || (!payload.query && !hasSteps)) {
      throw new Error(
        'Query or steps are required. Payload must include { "query": "your task description" } ' +
          'or a non-empty "steps" array.',
      );
    }

    // Fail fast: only the LLM path needs Groq, and checking here avoids
    // provisioning a browser session that would immediately be torn down.
    if (!hasSteps) {
      requireGroqKey(GROQ_API_KEY);
    }

    const options: AgentOptions = {
      maxRetries: payload.max_retries,
      retryDelayMs: payload.retry_delay_ms,
      strict: payload.strict,
    };
    const llmOptions: LlmOptions = {
      maxIterations: payload.max_iterations,
      postActionWaitMs: payload.post_action_wait_ms,
    };

    const session = new KernelBrowserSession(kernel, {
      stealth: true,
      recordReplay: payload.record_replay ?? false,
    });

    await session.start();
    console.log('Kernel browser live view url:', session.liveViewUrl);

    const moondream = new MoondreamClient({ apiKey: MOONDREAM_API_KEY });

    let result: { finalResponse: string; error?: string };
    try {
      result = payload.steps?.length
        ? await runAgent({
            query: payload.query,
            steps: payload.steps,
            moondream,
            kernel,
            sessionId: session.sessionId,
            options,
          })
        : await runLlmAgent({
            query: String(payload.query),
            moondream,
            kernel,
            sessionId: session.sessionId,
            groqApiKey: requireGroqKey(GROQ_API_KEY),
            options: llmOptions,
          });
    } catch (error) {
      console.error('Error in agent loop:', error);
      // Best-effort cleanup: a failure while stopping must not replace the
      // error that actually ended the run.
      try {
        await session.stop();
      } catch (stopError) {
        console.error('Failed to stop browser session:', stopError);
      }
      throw error;
    }

    // Stopped outside the try so a failing stop() is not retried by the handler above.
    const sessionInfo = await session.stop();

    return {
      result: result.finalResponse,
      replay_url: sessionInfo.replayViewUrl,
      error: result.error,
    };
  },
);

// Run locally if executed directly (not imported as a module)
// Execute via: npx tsx index.ts
if (import.meta.url === `file://${process.argv[1]}`) {
  const testQuery = process.env.TEST_QUERY || 'Navigate to https://example.com and describe the page';

  console.log('Running local test with query:', testQuery);

  /** Runs the LLM loop once against a throwaway session, always stopping it afterwards. */
  const runLocal = async (): Promise<void> => {
    // Checked first so a missing key does not start a browser.
    const groqApiKey = requireGroqKey(GROQ_API_KEY);

    const session = new KernelBrowserSession(kernel, {
      stealth: true,
      recordReplay: false,
    });
    await session.start();

    try {
      const moondream = new MoondreamClient({ apiKey: MOONDREAM_API_KEY });
      const result = await runLlmAgent({
        query: testQuery,
        moondream,
        kernel,
        sessionId: session.sessionId,
        groqApiKey,
      });
      console.log('Result:', result.finalResponse);
      if (result.error) {
        console.error('Error:', result.error);
      }
    } finally {
      await session.stop();
    }
  };

  runLocal()
    .then(() => {
      process.exit(0);
    })
    .catch((error: unknown) => {
      console.error('Local execution failed:', error);
      process.exit(1);
    });
}

/**
 * Returns the Groq API key unchanged, or throws when it is undefined or empty.
 *
 * @param key Value read from the GROQ_API_KEY environment variable.
 * @returns The same key, narrowed to `string`.
 * @throws Error when the key is missing.
 */
function requireGroqKey(key: string | undefined): string {
  if (!key) {
    throw new Error(
      'GROQ_API_KEY is not set. ' +
        'Set it via environment variable or deploy with: kernel deploy index.ts --env-file .env'
    );
  }
  return key;
}
' + + 'Set it via environment variable or deploy with: kernel deploy index.ts --env-file .env' + ); + } + return key; +} diff --git a/pkg/templates/typescript/moondream-groq-computer-use/llm_loop.ts b/pkg/templates/typescript/moondream-groq-computer-use/llm_loop.ts new file mode 100644 index 0000000..c1db44b --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/llm_loop.ts @@ -0,0 +1,797 @@ +import { Groq } from 'groq-sdk'; +import { jsonrepair } from 'jsonrepair'; +import type { Kernel } from '@onkernel/sdk'; +import { ComputerTool } from './tools/computer'; +import { + COORDINATE_SCALE, + ComputerAction, + DEFAULT_SCREEN_SIZE, + type ScreenSize, +} from './tools/types/computer'; +import type { MoondreamClient, MoondreamPoint } from './moondream'; + +const MODEL_NAME = 'openai/gpt-oss-120b'; + +const SYSTEM_PROMPT = [ + 'You are a browser-automation controller. You do NOT see images.', + 'You must decide actions and call Moondream for any visual understanding.', + 'Return ONLY a single JSON object that matches the schema below.', + 'Parsing note: the client will extract the substring between the first \'{\' and last \'}\' and run jsonrepair on it.', + 'Therefore, do NOT include any extra text before or after the JSON object.', + '', + 'Browser context:', + '- The browser is already open. 
Do NOT request an open_browser action.', + '', + 'Action policy:', + '- Bundle multiple actions when you can (e.g., navigate -> moondream_query).', + '- Use moondream_* actions for all visual understanding; keep queries short and specific.', + '- Never emit moondream_query without a clear question.', + '- Use click_at/type_text_at/scroll_at with coordinates in 0-1000 normalized scale.', + '- If you need coordinates, call moondream_point first.', + '- Prefer type_text_at with press_enter=true to submit searches; use key_combination mainly for shortcuts.', + '- You may include post_wait_ms in args to wait after an action (agent handles it).', + '- When a task asks for a link or page identity, use page_info after clicking or navigation.', + '- If your actions did not change state, reassess with a new Moondream question rather than repeating.', + '- If you need a specific item URL/details, open a specific item page (not a results list) and confirm it.', + '- If a click does not change the page, try a different target or use hover_at to reveal link text/URL.', + '- When opening an item, prefer clicking the title or image; verify you reached a detail page before returning its URL.', + '- If list items offer separate “comments/discussion” links and “title/article” links, click the title/article link unless the task explicitly asks for comments.', + '- On list pages with metadata/source links, click the title line (main link), not the source/domain/metadata line.', + '- If the task includes constraints, use on-screen evidence to select a qualifying item before answering.', + '- On list pages, identify a candidate item that matches constraints, then point to its title/image and click to open.', + '- Do not answer until you can confirm you are on the target page type (e.g., a single-item detail page).', + '- For “first/top result” tasks, click the topmost result item (not navigation, ads, or comments).', + '- When returning a URL, use the most recent page_info URL from the 
current page.', + '- Before final response for item-specific tasks, confirm the page type with moondream_query.', + '- If a click does not open the item, try a different target or a double-click by setting clicks: 2. If you suspect a new tab opened, use key_combination with ctrl+tab and re-check page_info.', + '- Use action result field state_changed to decide if a click/scroll had an effect; if false, adjust target or strategy.', + '- If the user specifies a site to search (e.g., Wikipedia), use that site\'s search first; only switch to another search engine if the site search fails.', + '- Never output placeholders like {{x}}, {{url}}, or in actions or final_response.', + '- Do not ask Moondream to infer the URL or page title; use page_info for those.', + '- If the task specifies a domain/URL, avoid leaving that domain unless the task explicitly requires it; if page_info shows an unexpected domain, go_back or navigate to the intended domain.', + '- If the task specifies a domain, your final_response URL must include that domain.', + '- After typing a search query, submit it (press_enter or search button). Avoid clicking unrelated suggestions or ads.', + '- For tasks like “first/top result,” ask Moondream to point at the first item or top result and click it.', + '- When moondream_point returns coordinates (x_norm/y_norm), use those exact numbers in click_at (x,y). Never use placeholders.', + '- Do not navigate to URLs derived from Moondream answers. Only navigate to URLs provided by the user or confirmed via page_info.', + '- If search results are not found after a couple of attempts, fallback to direct navigation to the most likely official page.', + '- Moondream query quality matters. Ask short, concrete, visual questions. 
Avoid vague or multi-part questions.', + '- When the task requires price or currency, verify the price on the detail page with a targeted Moondream query and return the exact text.', + '- For dense result grids, you may use moondream_detect with objects like "product image" or "item card" and click the topmost box.', + '- Never ask Moondream for a URL or link; only use page_info for URLs.', + '', + 'Moondream query examples (good vs bad):', + 'GOOD: "Is there a search box on this page?"', + 'BAD: "What should I do next?"', + 'GOOD: "What is the exact price shown for the highlighted item?"', + 'BAD: "Tell me everything about this page."', + 'GOOD: "Is this a single-item detail page?"', + 'BAD: "Is this page good?"', + 'GOOD: "Which button says \\"Sign in\\"?"', + 'BAD: "Find the right thing."', + 'BAD: "What is the URL for this page?"', + '', + 'Moondream query templates:', + '- Presence: "Is there a on the page?"', + '- Identification: "What is the exact text of the ?"', + '- Page type: "Is this a page?"', + '- Verification: "Does the page show the item I just clicked?"', + '- Result matching: "Which result shows the domain ?"', + '- If asked to use a search box, attempt a search interaction before using direct navigation; only fall back if stuck, and mention fallback in final_response.', + '- If the user requests JSON output, ensure final_response is valid JSON that matches the requested fields.', + '- When setting done=true, always include a non-empty final_response with concrete values (no placeholders like {{...}}).', + '- Stop when the task is complete by setting done=true and final_response.', + '', + 'JSON Schema:', + '{', + ' "$schema": "https://json-schema.org/draft/2020-12/schema",', + ' "type": "object",', + ' "properties": {', + ' "actions": {', + ' "type": "array",', + ' "items": {', + ' "type": "object",', + ' "properties": {', + ' "action": {', + ' "type": "string",', + ' "enum": [', + ' "navigate",', + ' "click_at",', + ' "hover_at",', + ' 
"type_text_at",', + ' "scroll_document",', + ' "scroll_at",', + ' "go_back",', + ' "go_forward",', + ' "key_combination",', + ' "drag_and_drop",', + ' "wait",', + ' "moondream_query",', + ' "moondream_caption",', + ' "moondream_point",', + ' "moondream_detect",', + ' "page_info",', + ' "done",', + ' "fail"', + ' ]', + ' },', + ' "args": { "type": "object" }', + ' },', + ' "required": ["action", "args"],', + ' "additionalProperties": false', + ' }', + ' },', + ' "done": { "type": "boolean" },', + ' "final_response": { "type": "string" },', + ' "error": { "type": "string" }', + ' },', + ' "required": ["actions"],', + ' "additionalProperties": false', + '}', + '', + 'Examples (valid JSON):', + '{"actions":[{"action":"navigate","args":{"url":"https://example.com"}},{"action":"moondream_caption","args":{"length":"short"}}]}', + '{"actions":[{"action":"moondream_point","args":{"object":"login button"}},{"action":"click_at","args":{"x":512,"y":412}}]}', + '{"actions":[],"done":true,"final_response":"Logged in and reached the dashboard."}', + '{"actions":[],"done":true,"final_response":"{\\"title\\":\\"Example Domain\\",\\"url\\":\\"https://example.com\\"}"}', +].join('\n'); + +export interface LlmOptions { + maxIterations?: number; + temperature?: number; + maxCompletionTokens?: number; + topP?: number; + postActionWaitMs?: number; + reasoningEffort?: 'low' | 'medium' | 'high' | string; +} + +interface LlmAction { + action: string; + args: Record; +} + +interface StepLog { + step: number; + action: string; + status: 'success' | 'failed'; + detail: string; + output?: string; +} + +export interface LlmResult { + finalResponse: string; + error?: string; +} + +export async function runLlmAgent({ + query, + moondream, + kernel, + sessionId, + groqApiKey, + options = {}, +}: { + query: string; + moondream: MoondreamClient; + kernel: Kernel; + sessionId: string; + groqApiKey: string; + options?: LlmOptions; +}): Promise { + const groq = new Groq({ apiKey: groqApiKey }); + const 
computer = new ComputerTool(kernel, sessionId); + + const messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> = [ + { role: 'system', content: SYSTEM_PROMPT }, + { + role: 'user', + content: `Task: ${query}\nReturn a JSON object with an actions array. Bundle multiple actions when sensible.`, + }, + ]; + + const logs: StepLog[] = []; + const answers: string[] = []; + let lastScreenshot: string | undefined; + let lastPageUrl: string | undefined; + let lastPointNorm: { x: number; y: number } | undefined; + let error: string | undefined; + + const maxIterations = options.maxIterations ?? 40; + + for (let iteration = 1; iteration <= maxIterations; iteration++) { + let raw: string; + try { + raw = await groqCompletion(groq, messages, options); + } catch (error) { + messages.push({ + role: 'user', + content: 'Your last output was invalid. Return ONLY a JSON object that matches the schema.', + }); + try { + raw = await groqCompletion(groq, messages, options); + } catch (err) { + error = err instanceof Error ? err.message : String(err); + raw = '{"actions":[]}'; + } + } + const batchPayload = parseJsonAction(raw); + messages.push({ role: 'assistant', content: JSON.stringify(batchPayload) }); + + const actions = normalizeActions(batchPayload); + const results: Array> = []; + let doneFlag = Boolean((batchPayload as Record).done); + let finalResponse = doneFlag ? 
String((batchPayload as Record).final_response || '') : ''; + + try { + for (const actionItem of actions) { + const action = String(actionItem.action || '').trim(); + const args = actionItem.args || {}; + if (!action) { + results.push({ action: '', status: 'failed', detail: 'missing action' }); + continue; + } + + if (action === 'navigate') { + const url = String(args.url || '').trim(); + if (!url) throw new Error('navigate requires url'); + if (url.includes('{{') || url.includes('}}') || url.toLowerCase().includes('placeholder')) { + logs.push({ step: iteration, action, status: 'failed', detail: 'navigate url is placeholder' }); + results.push({ action, status: 'failed', detail: 'navigate url is placeholder' }); + continue; + } + const result = await computer.executeAction(ComputerAction.NAVIGATE, { url }); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: `Navigated to ${url}` }); + results.push({ action, status: status(result.error), detail: `navigated to ${url}`, state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'click_at') { + let x: number; + let y: number; + try { + [x, y] = coerceCoords(args, computer.getScreenSize()); + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err); + if (lastPointNorm) { + x = lastPointNorm.x; + y = lastPointNorm.y; + results.push({ action, status: 'success', detail: 'used last moondream_point', used_last_point: true }); + } else { + logs.push({ step: iteration, action, status: 'failed', detail: message }); + results.push({ action, status: 'failed', detail: message }); + continue; + } + } + const result = await computer.executeAction(ComputerAction.CLICK_AT, { x, y }); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: 'Clicked at coordinates' }); + results.push({ action, status: status(result.error), detail: 'clicked', state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'hover_at') { + let x: number; + let y: number; + try { + [x, y] = coerceCoords(args, computer.getScreenSize()); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + logs.push({ step: iteration, action, status: 'failed', detail: message }); + results.push({ action, status: 'failed', detail: message }); + continue; + } + const result = await computer.executeAction(ComputerAction.HOVER_AT, { x, y }); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: 'Hovered at coordinates' }); + results.push({ action, status: status(result.error), detail: 'hovered', state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'type_text_at') { + let x: number; + let y: number; + try { + [x, y] = coerceCoords(args, computer.getScreenSize()); + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err); + if (lastPointNorm) { + x = lastPointNorm.x; + y = lastPointNorm.y; + results.push({ action, status: 'success', detail: 'used last moondream_point', used_last_point: true }); + } else { + logs.push({ step: iteration, action, status: 'failed', detail: message }); + results.push({ action, status: 'failed', detail: message }); + continue; + } + } + const text = args.text; + if (text === undefined) throw new Error('type_text_at requires text'); + const result = await computer.executeAction(ComputerAction.TYPE_TEXT_AT, { + x, + y, + text: String(text), + press_enter: Boolean(args.press_enter), + clear_before_typing: args.clear_before_typing !== false, + }); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: 'Typed text' }); + results.push({ action, status: status(result.error), detail: 'typed', state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'scroll_document') { + const direction = String(args.direction || 'down'); + const payload: Record = { direction }; + if (args.magnitude !== undefined) payload.magnitude = Number(args.magnitude); + const result = await computer.executeAction(ComputerAction.SCROLL_DOCUMENT, payload); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: `Scrolled ${direction}` }); + results.push({ action, status: status(result.error), detail: `scrolled ${direction}`, state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'scroll_at') { + let x: number; + let y: number; + try { + [x, y] = coerceCoords(args, computer.getScreenSize()); + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err); + logs.push({ step: iteration, action, status: 'failed', detail: message }); + results.push({ action, status: 'failed', detail: message }); + continue; + } + const direction = String(args.direction || 'down'); + const payload: Record = { x, y, direction }; + if (args.magnitude !== undefined) payload.magnitude = Number(args.magnitude); + const result = await computer.executeAction(ComputerAction.SCROLL_AT, payload); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: `Scrolled ${direction}` }); + results.push({ action, status: status(result.error), detail: `scrolled ${direction}`, state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'go_back') { + const result = await computer.executeAction(ComputerAction.GO_BACK, {}); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: 'Went back' }); + results.push({ action, status: status(result.error), detail: 'went back', state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'go_forward') { + const result = await computer.executeAction(ComputerAction.GO_FORWARD, {}); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: 'Went forward' }); + results.push({ action, status: status(result.error), detail: 'went forward', state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'key_combination') { + const keys = String(args.keys || '').trim(); + if (!keys) throw new Error('key_combination requires keys'); + const 
result = await computer.executeAction(ComputerAction.KEY_COMBINATION, { keys }); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: `Pressed ${keys}` }); + results.push({ action, status: status(result.error), detail: `pressed ${keys}`, state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'wait') { + const seconds = Number(args.seconds ?? 1); + await sleep(seconds * 1000); + logs.push({ step: iteration, action, status: 'success', detail: `Waited ${seconds.toFixed(2)}s` }); + results.push({ action, status: 'success', detail: `waited ${seconds.toFixed(2)}s` }); + } else if (action === 'moondream_query') { + const question = String(args.question || '').trim(); + if (!question) { + logs.push({ step: iteration, action, status: 'failed', detail: 'Missing question' }); + results.push({ action, status: 'failed', detail: 'missing question' }); + continue; + } + const screenshot = await ensureScreenshot(computer, lastScreenshot); + lastScreenshot = screenshot; + const answer = await moondream.query(screenshot, question); + answers.push(answer); + logs.push({ step: iteration, action, status: 'success', detail: 'Answered question', output: answer }); + results.push({ action, status: 'success', answer }); + } else if (action === 'moondream_caption') { + const length = String(args.length || 'normal') as 'short' | 'normal' | 'long'; + const screenshot = await ensureScreenshot(computer, lastScreenshot); + lastScreenshot = screenshot; + const caption = await moondream.caption(screenshot, length); + answers.push(caption); + logs.push({ step: iteration, action, status: 'success', detail: 'Captioned image', output: caption }); + results.push({ action, status: 'success', caption }); + } else if (action === 'drag_and_drop') { + if (args.x === undefined || args.y === undefined) throw new 
Error('drag_and_drop requires x and y'); + if (args.destination_x === undefined || args.destination_y === undefined) { + throw new Error('drag_and_drop requires destination_x and destination_y'); + } + let x: number; + let y: number; + let destination_x: number; + let destination_y: number; + try { + [x, y] = coerceCoords({ x: args.x, y: args.y }, computer.getScreenSize()); + [destination_x, destination_y] = coerceCoords( + { x: args.destination_x, y: args.destination_y }, + computer.getScreenSize(), + ); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + logs.push({ step: iteration, action, status: 'failed', detail: message }); + results.push({ action, status: 'failed', detail: message }); + continue; + } + const result = await computer.executeAction(ComputerAction.DRAG_AND_DROP, { + x, + y, + destination_x, + destination_y, + }); + const { screenshot, stateChanged } = updateScreenshotWithState(result.base64Image, lastScreenshot); + lastScreenshot = screenshot; + logs.push({ step: iteration, action, status: status(result.error), detail: 'Dragged element' }); + results.push({ action, status: status(result.error), detail: 'dragged', state_changed: stateChanged }); + await postWait(action, args, options); + } else if (action === 'moondream_point') { + const objectLabel = String(args.object || '').trim(); + if (!objectLabel) throw new Error('moondream_point requires object'); + const screenshot = await ensureScreenshot(computer, lastScreenshot); + lastScreenshot = screenshot; + const point = await moondream.point(screenshot, objectLabel); + if (!point) { + logs.push({ step: iteration, action, status: 'failed', detail: 'No point found' }); + results.push({ action, status: 'failed', detail: 'no point found' }); + } else { + const payload = pointPayload(point, computer.getScreenSize()); + if (typeof payload.x_norm === 'number' && typeof payload.y_norm === 'number') { + lastPointNorm = { x: payload.x_norm, y: payload.y_norm }; + } + 
logs.push({ step: iteration, action, status: 'success', detail: 'Point found', output: JSON.stringify(payload) }); + results.push({ action, status: 'success', ...payload }); + } + } else if (action === 'moondream_detect') { + const objectLabel = String(args.object || '').trim(); + if (!objectLabel) throw new Error('moondream_detect requires object'); + const screenshot = await ensureScreenshot(computer, lastScreenshot); + lastScreenshot = screenshot; + const detections = await moondream.detect(screenshot, objectLabel); + const payload = detectPayload(detections, computer.getScreenSize()); + logs.push({ step: iteration, action, status: 'success', detail: 'Detection results', output: JSON.stringify(payload) }); + results.push({ action, status: 'success', ...payload }); + } else if (action === 'page_info') { + const payload = await pageInfo(computer); + const urlValue = typeof payload.url === 'string' ? payload.url : undefined; + const stateChanged = Boolean(urlValue && urlValue !== lastPageUrl); + if (urlValue) { + lastPageUrl = urlValue; + } + payload.state_changed = stateChanged; + const statusValue = payload.error ? 'failed' : 'success'; + logs.push({ step: iteration, action, status: statusValue, detail: 'Page info', output: JSON.stringify(payload) }); + results.push({ action, status: statusValue, ...payload }); + } else if (action === 'done') { + doneFlag = true; + finalResponse = String(args.final_response || ''); + break; + } else if (action === 'fail') { + error = String(args.error || 'unknown error'); + logs.push({ step: iteration, action, status: 'failed', detail: error }); + results.push({ action, status: 'failed', detail: error }); + doneFlag = true; + break; + } else { + throw new Error(`Unknown action: ${action}`); + } + } + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err); + logs.push({ step: iteration, action: 'batch', status: 'failed', detail: message }); + error = message; + results.push({ action: 'batch', status: 'failed', detail: message }); + } + + appendResult(messages, 'batch', { results }); + + if ( + doneFlag && + (!finalResponse || + finalResponse.includes('{{') || + finalResponse.includes('}}') || + finalResponse.toLowerCase().includes('placeholder')) + ) { + messages.push({ + role: 'user', + content: + 'final_response must be non-empty and use concrete values (no placeholders). Return a corrected JSON object.', + }); + doneFlag = false; + finalResponse = ''; + } + + if (doneFlag) { + const trimmed = finalResponse.trim(); + if (trimmed.startsWith('{')) { + try { + const repaired = jsonrepair(trimmed); + const parsed = JSON.parse(repaired); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error('final_response JSON is not an object'); + } + } catch { + messages.push({ + role: 'user', + content: 'final_response looks like JSON but is invalid. Return a valid JSON object string.', + }); + doneFlag = false; + finalResponse = ''; + } + } + } + + if (doneFlag) { + const urls = extractUrls(finalResponse); + if (urls.length > 0 && !lastPageUrl) { + messages.push({ + role: 'user', + content: + 'You returned a URL but did not call page_info. Call page_info on the current page before final_response.', + }); + doneFlag = false; + finalResponse = ''; + } else if (urls.length > 0 && lastPageUrl && urls.some(url => url !== lastPageUrl)) { + messages.push({ + role: 'user', + content: + 'The returned URL does not match the current page_info URL. 
Navigate to the correct page and then return that URL.', + }); + doneFlag = false; + finalResponse = ''; + } + } + + if (doneFlag) { + const summary = `Completed ${logs.filter(log => log.status === 'success').length}/${logs.length} steps`; + const resultPayload = { + summary, + final_response: finalResponse, + steps: logs, + answers, + }; + return { finalResponse: JSON.stringify(resultPayload, null, 2), error }; + } + } + + const summary = `Completed ${logs.filter(log => log.status === 'success').length}/${logs.length} steps`; + const resultPayload = { + summary, + final_response: '', + steps: logs, + answers, + }; + + return { finalResponse: JSON.stringify(resultPayload, null, 2), error }; +} + +async function groqCompletion( + groq: Groq, + messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>, + options: LlmOptions, +): Promise { + const completion = await groq.chat.completions.create({ + model: MODEL_NAME, + messages, + temperature: options.temperature ?? 1.0, + max_completion_tokens: options.maxCompletionTokens ?? 65536, + top_p: options.topP ?? 1, + reasoning_effort: options.reasoningEffort ?? 'medium', + stream: false, + response_format: { type: 'json_object' }, + } as any); + return completion.choices?.[0]?.message?.content ?? 
''; +} + +function parseJsonAction(raw: string): Record { + const start = raw.indexOf('{'); + const end = raw.lastIndexOf('}'); + if (start === -1 || end === -1 || end <= start) { + throw new Error('No JSON object found in LLM response'); + } + const snippet = raw.slice(start, end + 1); + const repaired = jsonrepair(snippet); + const parsed = JSON.parse(repaired); + if (!parsed || typeof parsed !== 'object') { + throw new Error('LLM JSON did not produce an object'); + } + return parsed as Record; +} + +function extractUrls(finalResponse: string): string[] { + const text = finalResponse.trim(); + if (!text.startsWith('{') || !text.endsWith('}')) return []; + try { + const repaired = jsonrepair(text); + const parsed = JSON.parse(repaired); + if (!parsed || typeof parsed !== 'object') return []; + const urls: string[] = []; + for (const [key, value] of Object.entries(parsed as Record)) { + if (key.toLowerCase().includes('url') && typeof value === 'string') { + urls.push(value); + } + } + return urls; + } catch { + return []; + } +} + +function normalizeActions(payload: Record): LlmAction[] { + if (Array.isArray(payload.actions)) { + return payload.actions + .filter(item => item && typeof item === 'object') + .map(item => item as LlmAction) + .filter(item => typeof item.action === 'string' && item.action.trim().length > 0); + } + if (payload.action && typeof payload.action === 'string') { + return [{ action: String(payload.action), args: (payload.args as Record) || {} }]; + } + return []; +} + +function appendResult( + messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>, + action: string, + payload: unknown, +): void { + messages.push({ + role: 'user', + content: JSON.stringify({ type: 'action_result', action, output: payload }), + }); +} + +function updateScreenshot(current?: string, fallback?: string): string | undefined { + return current ?? 
fallback; +} + +function updateScreenshotWithState( + current?: string, + previous?: string, +): { screenshot?: string; stateChanged: boolean } { + const screenshot = updateScreenshot(current, previous); + if (!screenshot) { + return { screenshot: previous, stateChanged: false }; + } + if (!previous) { + return { screenshot, stateChanged: true }; + } + return { screenshot, stateChanged: screenshot !== previous }; +} + +function status(error?: string): 'success' | 'failed' { + return error ? 'failed' : 'success'; +} + +function coerceCoords(args: Record, screenSize: ScreenSize): [number, number] { + if (args.x === undefined || args.y === undefined) { + throw new Error('x and y are required'); + } + if (typeof args.x === 'string' && (args.x.includes('{') || args.x.includes('}'))) { + throw new Error('x must be a number, not a placeholder'); + } + if (typeof args.y === 'string' && (args.y.includes('{') || args.y.includes('}'))) { + throw new Error('y must be a number, not a placeholder'); + } + const x = Number(args.x); + const y = Number(args.y); + if (x >= 0 && x <= 1 && y >= 0 && y <= 1) { + return [Math.round(x * COORDINATE_SCALE), Math.round(y * COORDINATE_SCALE)]; + } + if (x >= 0 && x <= COORDINATE_SCALE && y >= 0 && y <= COORDINATE_SCALE) { + return [Math.round(x), Math.round(y)]; + } + const width = screenSize?.width ?? DEFAULT_SCREEN_SIZE.width; + const height = screenSize?.height ?? 
DEFAULT_SCREEN_SIZE.height; + return [ + Math.round((x / width) * COORDINATE_SCALE), + Math.round((y / height) * COORDINATE_SCALE), + ]; +} + +async function ensureScreenshot(computer: ComputerTool, lastScreenshot?: string): Promise { + if (lastScreenshot) return lastScreenshot; + const result = await computer.screenshot(); + if (result.error || !result.base64Image) { + throw new Error(result.error || 'Failed to capture screenshot'); + } + return result.base64Image; +} + +function pointPayload(point: MoondreamPoint, screenSize: ScreenSize): Record { + const xNorm = Math.round(point.x * COORDINATE_SCALE); + const yNorm = Math.round(point.y * COORDINATE_SCALE); + const xPx = Math.round(point.x * screenSize.width); + const yPx = Math.round(point.y * screenSize.height); + return { + x: point.x, + y: point.y, + x_norm: xNorm, + y_norm: yNorm, + x_px: xPx, + y_px: yPx, + screen: { width: screenSize.width, height: screenSize.height }, + }; +} + +function detectPayload( + detections: Array<{ x_min: number; y_min: number; x_max: number; y_max: number }>, + screenSize: ScreenSize, +): Record { + const objects = detections.map(det => ({ + x_min: det.x_min, + y_min: det.y_min, + x_max: det.x_max, + y_max: det.y_max, + x_min_norm: Math.round(det.x_min * COORDINATE_SCALE), + y_min_norm: Math.round(det.y_min * COORDINATE_SCALE), + x_max_norm: Math.round(det.x_max * COORDINATE_SCALE), + y_max_norm: Math.round(det.y_max * COORDINATE_SCALE), + x_min_px: Math.round(det.x_min * screenSize.width), + y_min_px: Math.round(det.y_min * screenSize.height), + x_max_px: Math.round(det.x_max * screenSize.width), + y_max_px: Math.round(det.y_max * screenSize.height), + })); + return { objects, screen: { width: screenSize.width, height: screenSize.height } }; +} + +async function pageInfo(computer: ComputerTool): Promise> { + try { + const { createRequire } = await import('node:module'); + const require = createRequire(import.meta.url); + let playwright: any; + try { + playwright = 
require('playwright-core'); + } catch { + return { error: 'playwright-core not installed' }; + } + const browserInfo = await computer.getKernel().browsers.retrieve(computer.getSessionId()); + const cdpUrl = browserInfo?.cdp_ws_url as string | undefined; + if (!cdpUrl) return { error: 'cdp url not available' }; + + const browser = await playwright.chromium.connectOverCDP(cdpUrl); + const pages: any[] = []; + for (const context of browser.contexts()) { + pages.push(...context.pages()); + } + const page = pages.length > 0 ? pages[pages.length - 1] : await browser.newPage(); + const title = await page.title(); + const url = page.url(); + await browser.close(); + return { url, title }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } +} + +async function postWait( + action: string, + args: Record, + options: LlmOptions, +): Promise { + const waitActions = new Set([ + 'navigate', + 'click_at', + 'hover_at', + 'type_text_at', + 'scroll_document', + 'scroll_at', + 'go_back', + 'go_forward', + 'key_combination', + 'drag_and_drop', + ]); + if (!waitActions.has(action)) return; + const override = args.post_wait_ms; + const defaultWait = options.postActionWaitMs ?? 500; + const waitMs = typeof override === 'number' ? override : defaultWait; + if (waitMs > 0) { + await sleep(waitMs); + } +} + +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} diff --git a/pkg/templates/typescript/moondream-groq-computer-use/loop.ts b/pkg/templates/typescript/moondream-groq-computer-use/loop.ts new file mode 100644 index 0000000..37f02c0 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/loop.ts @@ -0,0 +1,342 @@ +/** + * Moondream computer-use agent loop. 
+ */ + +import type { Kernel } from '@onkernel/sdk'; +import { ComputerTool } from './tools/computer'; +import { + ComputerAction, + COORDINATE_SCALE, + DEFAULT_SCREEN_SIZE, + type ScreenSize, +} from './tools/types/computer'; +import type { MoondreamClient, MoondreamPoint } from './moondream'; + +const URL_RE = /https?:\/\/[^\s)]+/i; + +export interface StepInput { + action: string; + url?: string; + target?: string; + text?: string; + question?: string; + direction?: 'up' | 'down' | 'left' | 'right'; + magnitude?: number; + x?: number; + y?: number; + keys?: string; + seconds?: number; + retries?: number; + retry_delay_ms?: number; + pre_wait_ms?: number; + press_enter?: boolean; + clear_before_typing?: boolean; + length?: 'short' | 'normal' | 'long'; +} + +export interface AgentOptions { + maxRetries?: number; + retryDelayMs?: number; + strict?: boolean; +} + +export interface AgentResult { + finalResponse: string; + error?: string; +} + +interface StepLog { + step: number; + action: string; + status: 'success' | 'failed'; + detail: string; + output?: string; +} + +export async function runAgent({ + query, + steps, + moondream, + kernel, + sessionId, + options, +}: { + query?: string; + steps?: StepInput[]; + moondream: MoondreamClient; + kernel: Kernel; + sessionId: string; + options: AgentOptions; +}): Promise { + const computer = new ComputerTool(kernel, sessionId); + const parsedSteps = steps?.length ? steps : parseSteps(query || ''); + + if (!parsedSteps.length) { + throw new Error('No steps could be derived from the query. 
Provide steps or a query.'); + } + + const logs: StepLog[] = []; + const answers: string[] = []; + let lastScreenshot: string | undefined; + let error: string | undefined; + + for (const [index, step] of parsedSteps.entries()) { + const stepNumber = index + 1; + const action = (step.action || '').trim().toLowerCase(); + + if (!action) { + logs.push({ step: stepNumber, action: 'unknown', status: 'failed', detail: 'Missing action' }); + if (options.strict) { + error = 'Missing action in step'; + break; + } + continue; + } + + try { + if (step.pre_wait_ms) { + await sleep(step.pre_wait_ms); + } + + if (action === 'open_web_browser' || action === 'open') { + const result = await computer.executeAction(ComputerAction.OPEN_WEB_BROWSER, {}); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: 'Opened browser' }); + } else if (action === 'navigate') { + const url = step.url || findUrl(query || ''); + if (!url) throw new Error('navigate requires url'); + const result = await computer.executeAction(ComputerAction.NAVIGATE, { url }); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: `Navigated to ${url}` }); + } else if (action === 'go_back') { + const result = await computer.executeAction(ComputerAction.GO_BACK, {}); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: 'Went back' }); + } else if (action === 'go_forward') { + const result = await computer.executeAction(ComputerAction.GO_FORWARD, {}); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: 'Went forward' }); + } else if (action === 'search') { + const result = await computer.executeAction(ComputerAction.SEARCH, {}); + 
lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: 'Focused address bar' }); + } else if (action === 'wait') { + const seconds = step.seconds ?? 1; + await sleep(seconds * 1000); + logs.push({ step: stepNumber, action, status: 'success', detail: `Waited ${seconds.toFixed(2)}s` }); + } else if (action === 'key') { + if (!step.keys) throw new Error('key action requires keys'); + const result = await computer.executeAction(ComputerAction.KEY_COMBINATION, { keys: step.keys }); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: `Pressed ${step.keys}` }); + } else if (action === 'scroll') { + const direction = step.direction ?? 'down'; + const magnitude = step.magnitude; + if (step.x !== undefined && step.y !== undefined) { + const [xNorm, yNorm] = normalizePoint(step.x, step.y, computer.getScreenSize()); + const args: Record = { x: xNorm, y: yNorm, direction }; + if (magnitude !== undefined) args.magnitude = magnitude; + const result = await computer.executeAction(ComputerAction.SCROLL_AT, args); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + } else { + const args: Record = { direction }; + if (magnitude !== undefined) args.magnitude = magnitude; + const result = await computer.executeAction(ComputerAction.SCROLL_DOCUMENT, args); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + } + logs.push({ step: stepNumber, action, status: 'success', detail: `Scrolled ${direction}` }); + } else if (action === 'click' || action === 'type') { + const target = step.target; + const retries = step.retries ?? options.maxRetries ?? 3; + const delayMs = step.retry_delay_ms ?? options.retryDelayMs ?? 
1000; + + const coords = await resolveTargetCoords({ + step, + target, + moondream, + computer, + lastScreenshot, + retries, + delayMs, + }); + + if (!coords) throw new Error(`Unable to locate target: ${target}`); + + const [xNorm, yNorm] = coords; + if (action === 'click') { + const result = await computer.executeAction(ComputerAction.CLICK_AT, { x: xNorm, y: yNorm }); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: `Clicked ${target}` }); + } else { + if (step.text === undefined) throw new Error('type action requires text'); + const result = await computer.executeAction(ComputerAction.TYPE_TEXT_AT, { + x: xNorm, + y: yNorm, + text: String(step.text), + press_enter: Boolean(step.press_enter), + clear_before_typing: step.clear_before_typing !== false, + }); + lastScreenshot = updateScreenshot(result.base64Image, lastScreenshot); + logs.push({ step: stepNumber, action, status: status(result.error), detail: `Typed into ${target}` }); + } + } else if (action === 'query') { + const question = step.question || query; + if (!question) throw new Error('query action requires question'); + const screenshot = await ensureScreenshot(computer, lastScreenshot); + lastScreenshot = screenshot; + const answer = await moondream.query(screenshot, String(question)); + answers.push(answer); + logs.push({ step: stepNumber, action, status: 'success', detail: 'Answered question', output: answer }); + } else if (action === 'caption') { + const length = step.length ?? 
'normal'; + const screenshot = await ensureScreenshot(computer, lastScreenshot); + lastScreenshot = screenshot; + const caption = await moondream.caption(screenshot, length); + answers.push(caption); + logs.push({ step: stepNumber, action, status: 'success', detail: 'Generated caption', output: caption }); + } else { + throw new Error(`Unknown action: ${action}`); + } + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + logs.push({ step: stepNumber, action, status: 'failed', detail: message }); + error = message; + if (options.strict) break; + } + } + + const summary = `Completed ${logs.filter(l => l.status === 'success').length}/${logs.length} steps`; + const resultPayload = { + summary, + steps: logs, + answers, + }; + + return { + finalResponse: JSON.stringify(resultPayload, null, 2), + error, + }; +} + +export function parseSteps(query: string): StepInput[] { + const trimmed = query.trim(); + if (!trimmed) return []; + + if (trimmed.startsWith('{') || trimmed.startsWith('[')) { + try { + const data = JSON.parse(trimmed); + if (Array.isArray(data)) return data as StepInput[]; + if (data && typeof data === 'object' && Array.isArray((data as { steps?: StepInput[] }).steps)) { + return (data as { steps: StepInput[] }).steps; + } + } catch { + // fall through + } + } + + const steps: StepInput[] = []; + const url = findUrl(trimmed); + if (url) steps.push({ action: 'navigate', url }); + + const question = stripUrlAndNavigation(trimmed); + const wantsCaption = /\bdescribe|caption\b/i.test(trimmed); + + if (wantsCaption) { + steps.push({ action: 'caption' }); + } else if (question) { + steps.push({ action: 'query', question }); + } else if (url) { + steps.push({ action: 'caption' }); + } else { + steps.push({ action: 'query', question: trimmed }); + } + + return steps; +} + +function findUrl(query: string): string | undefined { + const match = query.match(URL_RE); + return match?.[0]; +} + +function stripUrlAndNavigation(query: 
string): string { + let cleaned = query.replace(URL_RE, ''); + cleaned = cleaned.replace(/\b(navigate|open|go|visit)\b/gi, ''); + cleaned = cleaned.replace(/\bto\b/gi, ' '); + cleaned = cleaned.replace(/\s+/g, ' ').trim(); + return cleaned.replace(/^[,.;:-]+|[,.;:-]+$/g, ''); +} + +function normalizePoint(x: number, y: number, screenSize?: ScreenSize): [number, number] { + if (x >= 0 && x <= 1 && y >= 0 && y <= 1) { + return [Math.round(x * COORDINATE_SCALE), Math.round(y * COORDINATE_SCALE)]; + } + const width = screenSize?.width ?? DEFAULT_SCREEN_SIZE.width; + const height = screenSize?.height ?? DEFAULT_SCREEN_SIZE.height; + return [ + Math.round((x / width) * COORDINATE_SCALE), + Math.round((y / height) * COORDINATE_SCALE), + ]; +} + +function updateScreenshot(current?: string, fallback?: string): string | undefined { + return current ?? fallback; +} + +function status(error?: string): 'success' | 'failed' { + return error ? 'failed' : 'success'; +} + +async function ensureScreenshot(computer: ComputerTool, lastScreenshot?: string): Promise { + if (lastScreenshot) return lastScreenshot; + const result = await computer.screenshot(); + if (result.error || !result.base64Image) { + throw new Error(result.error || 'Failed to capture screenshot'); + } + return result.base64Image; +} + +async function resolveTargetCoords({ + step, + target, + moondream, + computer, + lastScreenshot, + retries, + delayMs, +}: { + step: StepInput; + target?: string; + moondream: MoondreamClient; + computer: ComputerTool; + lastScreenshot?: string; + retries: number; + delayMs: number; +}): Promise<[number, number] | undefined> { + if (step.x !== undefined && step.y !== undefined) { + return normalizePoint(step.x, step.y, computer.getScreenSize()); + } + + if (!target) return undefined; + + const attempts = Math.max(1, retries); + let currentScreenshot = lastScreenshot; + + for (let attempt = 0; attempt < attempts; attempt++) { + const screenshot = await ensureScreenshot(computer, 
currentScreenshot); + const point = await moondream.point(screenshot, String(target)); + if (point) { + return normalizePoint(point.x, point.y, computer.getScreenSize()); + } + if (attempt < attempts - 1) { + await sleep(delayMs); + currentScreenshot = undefined; + } + } + + return undefined; +} + +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} diff --git a/pkg/templates/typescript/moondream-groq-computer-use/moondream.ts b/pkg/templates/typescript/moondream-groq-computer-use/moondream.ts new file mode 100644 index 0000000..6267fa9 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/moondream.ts @@ -0,0 +1,135 @@ +export interface MoondreamPoint { + x: number; + y: number; +} + +export class MoondreamError extends Error {} + +export class MoondreamClient { + private apiKey: string; + private baseUrl: string; + private timeoutMs: number; + + constructor({ + apiKey, + baseUrl = 'https://api.moondream.ai/v1', + timeoutMs = 30000, + }: { + apiKey: string; + baseUrl?: string; + timeoutMs?: number; + }) { + this.apiKey = apiKey; + this.baseUrl = baseUrl.replace(/\/$/, ''); + this.timeoutMs = timeoutMs; + } + + async query(imageBase64: string, question: string, reasoning?: boolean): Promise { + const payload: Record = { + image_url: toDataUrl(imageBase64), + question, + }; + if (reasoning !== undefined) payload.reasoning = reasoning; + const data = await this.post('/query', payload); + if (typeof data.answer !== 'string') { + throw new MoondreamError('Moondream query returned an invalid response'); + } + return data.answer; + } + + async caption(imageBase64: string, length: 'short' | 'normal' | 'long' = 'normal'): Promise { + const payload = { + image_url: toDataUrl(imageBase64), + length, + stream: false, + }; + const data = await this.post('/caption', payload); + if (typeof data.caption !== 'string') { + throw new MoondreamError('Moondream caption returned an invalid response'); + } + return 
data.caption; + } + + async point(imageBase64: string, objectLabel: string): Promise { + const payload = { + image_url: toDataUrl(imageBase64), + object: objectLabel, + }; + const data = await this.post('/point', payload); + if (!Array.isArray(data.points) || data.points.length === 0) { + return null; + } + const point = data.points[0] as { x?: number; y?: number }; + if (typeof point?.x !== 'number' || typeof point?.y !== 'number') { + return null; + } + return { x: point.x, y: point.y }; + } + + async detect( + imageBase64: string, + objectLabel: string, + ): Promise> { + const payload = { + image_url: toDataUrl(imageBase64), + object: objectLabel, + }; + const data = await this.post('/detect', payload); + if (!Array.isArray(data.objects)) { + return []; + } + const results: Array<{ x_min: number; y_min: number; x_max: number; y_max: number }> = []; + for (const item of data.objects) { + const box = item as { x_min?: number; y_min?: number; x_max?: number; y_max?: number }; + if ([box.x_min, box.y_min, box.x_max, box.y_max].every(v => typeof v === 'number')) { + results.push({ + x_min: box.x_min as number, + y_min: box.y_min as number, + x_max: box.x_max as number, + y_max: box.y_max as number, + }); + } + } + return results; + } + + private async post(path: string, payload: Record): Promise> { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), this.timeoutMs); + + try { + const response = await fetch(`${this.baseUrl}${path}`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'X-Moondream-Auth': this.apiKey, + }, + body: JSON.stringify(payload), + signal: controller.signal, + }); + + if (!response.ok) { + const text = await response.text(); + throw new MoondreamError(`Moondream API error ${response.status}: ${text}`); + } + + const data = await response.json(); + if (!data || typeof data !== 'object') { + throw new MoondreamError('Moondream API returned unexpected response type'); + } + return 
data as Record; + } catch (error) { + if (error instanceof MoondreamError) throw error; + if (error instanceof Error) { + throw new MoondreamError(error.message); + } + throw new MoondreamError(String(error)); + } finally { + clearTimeout(timeout); + } + } +} + +function toDataUrl(imageBase64: string): string { + return `data:image/png;base64,${imageBase64}`; +} diff --git a/pkg/templates/typescript/moondream-groq-computer-use/package-lock.json b/pkg/templates/typescript/moondream-groq-computer-use/package-lock.json new file mode 100644 index 0000000..ee1f977 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/package-lock.json @@ -0,0 +1,496 @@ +{ + "name": "ts-moondream-cua", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "ts-moondream-cua", + "dependencies": { + "@onkernel/sdk": "^0.23.0", + "groq-sdk": "^0.37.0", + "jsonrepair": "^3.13.2", + "playwright-core": "^1.58.1" + }, + "devDependencies": { + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } + }, + "node_modules/@onkernel/sdk": { + "version": "0.23.0", + "resolved": "https://registry.npmjs.org/@onkernel/sdk/-/sdk-0.23.0.tgz", + "integrity": "sha512-P/ez6HU8sO2QvqWATkvC+Wdv+fgto4KfBCHLl2T6EUpoU3LhgOZ/sJP2ZRf/vh5Vh7QR2Vf05RgMaFcIGBGD9Q==", + "license": "Apache-2.0" + }, + "node_modules/@types/node": { + "version": "22.19.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.7.tgz", + "integrity": "sha512-MciR4AKGHWl7xwxkBa6xUGxQJ4VBOmPTF7sL+iGzuahOFaO0jHCsuEfS80pan1ef4gWId1oWOweIhrDEYLuaOw==", + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.13", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz", + "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.4" + } + }, + 
"node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/agentkeepalive": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz", + "integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==", + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": 
"sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": 
"^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/form-data": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", + "license": "MIT" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "license": "MIT", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": 
"https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/groq-sdk": { + "version": "0.37.0", + "resolved": "https://registry.npmjs.org/groq-sdk/-/groq-sdk-0.37.0.tgz", + "integrity": "sha512-lT72pcT8b/X5XrzdKf+rWVzUGW1OQSKESmL8fFN5cTbsf02gq6oFam4SVeNtzELt9cYE2Pt3pdGgSImuTbHFDg==", + "license": "Apache-2.0", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + } + }, + "node_modules/groq-sdk/node_modules/@types/node": { + "version": "18.19.130", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-18.19.130.tgz", + "integrity": "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/groq-sdk/node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "license": "MIT", + 
"dependencies": { + "ms": "^2.0.0" + } + }, + "node_modules/jsonrepair": { + "version": "3.13.2", + "resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.2.tgz", + "integrity": "sha512-Leuly0nbM4R+S5SVJk3VHfw1oxnlEK9KygdZvfUtEtTawNDyzB4qa1xWTmFt1aeoA7sXZkVTRuIixJ8bAvqVUg==", + "license": "ISC", + "bin": { + "jsonrepair": "bin/cli.js" + } + }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "deprecated": "Use your platform's native DOMException instead", + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/playwright-core": { + "version": "1.58.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.1.tgz", + "integrity": "sha512-bcWzOaTxcW+VOOGBCQgnaKToLJ65d6AqfLVKEWvexyS3AS6rbXl+xdpYRMGSRBClPvyj44njOWoxjNdL/H9UNg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "license": "MIT" + }, + 
"node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + } + } +} diff --git a/pkg/templates/typescript/moondream-groq-computer-use/package.json b/pkg/templates/typescript/moondream-groq-computer-use/package.json new file mode 100644 index 0000000..45c3fcf --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/package.json @@ -0,0 +1,16 @@ +{ + "name": "ts-moondream-cua", + "module": "index.ts", + "type": "module", + "private": true, + "dependencies": { + "@onkernel/sdk": "^0.23.0", + "groq-sdk": "^0.37.0", + "jsonrepair": "^3.13.2", + "playwright-core": "^1.58.1" + }, + "devDependencies": { + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } +} diff --git a/pkg/templates/typescript/moondream-groq-computer-use/session.ts b/pkg/templates/typescript/moondream-groq-computer-use/session.ts new file mode 100644 index 0000000..17a6c18 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/session.ts @@ -0,0 +1,199 @@ +/** + * Kernel Browser Session Manager. 
/**
 * Provides a class for managing Kernel browser lifecycle
 * with optional video replay recording.
 */

import type { Kernel } from '@onkernel/sdk';
import { DEFAULT_SCREEN_SIZE } from './tools/types/computer';

/** Caller-tunable settings for a browser session; every field has a default. */
export interface SessionOptions {
  /** Passed through as `stealth` when the browser is created. */
  stealth?: boolean;
  /** Passed through as `timeout_seconds` when the browser is created. */
  timeoutSeconds?: number;
  /** When true, a replay recording is started right after the browser is created. */
  recordReplay?: boolean;
  /** Seconds to wait in `stop()` before the replay recording is stopped. */
  replayGracePeriod?: number;
}

/** Snapshot of the identifiers and URLs associated with a session. */
export interface SessionInfo {
  sessionId: string;
  liveViewUrl: string;
  replayId?: string;
  replayViewUrl?: string;
}

const DEFAULT_OPTIONS: Required<SessionOptions> = {
  stealth: true,
  timeoutSeconds: 600,
  recordReplay: false,
  replayGracePeriod: 5.0,
};

/** Delay between "replay stopped" and the first poll for its view URL. */
const REPLAY_PROCESSING_DELAY_MS = 2000;
/** Upper bound on how long `stopReplay()` polls for the finished replay. */
const REPLAY_POLL_TIMEOUT_MS = 60000;
/** Pause between two consecutive replay polls. */
const REPLAY_POLL_INTERVAL_MS = 1000;

/**
 * Owns one Kernel browser: creates it in `start()`, optionally records a
 * replay, and destroys it in `stop()`.
 */
export class KernelBrowserSession {
  private kernel: Kernel;
  private options: Required<SessionOptions>;

  // Session state
  private _sessionId: string | null = null;
  private _liveViewUrl: string | null = null;
  private _replayId: string | null = null;
  private _replayViewUrl: string | null = null;

  constructor(kernel: Kernel, options: SessionOptions = {}) {
    this.kernel = kernel;
    this.options = { ...DEFAULT_OPTIONS, ...options };
  }

  /**
   * Identifier of the running browser session.
   *
   * @throws Error if `start()` has not been called (or `stop()` already ran).
   */
  get sessionId(): string {
    if (!this._sessionId) {
      throw new Error('Session not started. Call start() first.');
    }
    return this._sessionId;
  }

  get liveViewUrl(): string | null {
    return this._liveViewUrl;
  }

  get replayViewUrl(): string | null {
    return this._replayViewUrl;
  }

  /**
   * Current session details.
   *
   * @throws Error if the session has not been started (via `sessionId`).
   */
  get info(): SessionInfo {
    return this.snapshot(this.sessionId);
  }

  /**
   * Creates the browser and, if `recordReplay` is set, starts a replay.
   * A replay start failure is logged and does not fail the call.
   *
   * @returns Details of the newly created session.
   */
  async start(): Promise<SessionInfo> {
    // Create browser with specified settings
    const browser = await this.kernel.browsers.create({
      stealth: this.options.stealth,
      timeout_seconds: this.options.timeoutSeconds,
      viewport: {
        width: DEFAULT_SCREEN_SIZE.width,
        height: DEFAULT_SCREEN_SIZE.height,
      },
    });

    this._sessionId = browser.session_id;
    this._liveViewUrl = browser.browser_live_view_url ?? null;

    console.log(`Kernel browser created: ${this._sessionId}`);
    console.log(`Live view URL: ${this._liveViewUrl}`);

    // Start replay recording if enabled
    if (this.options.recordReplay) {
      try {
        await this.startReplay();
      } catch (error) {
        console.warn(`Warning: Failed to start replay recording: ${error}`);
        console.warn('Continuing without replay recording.');
      }
    }

    return this.info;
  }

  /** Starts a replay recording for the current session (no-op without a session). */
  private async startReplay(): Promise<void> {
    if (!this._sessionId) {
      return;
    }

    console.log('Starting replay recording...');
    const replay = await this.kernel.browsers.replays.start(this._sessionId);
    this._replayId = replay.replay_id;
    console.log(`Replay recording started: ${this._replayId}`);
  }

  /**
   * Stops the replay recording and polls the replay list until the matching
   * replay shows up or the poll timeout elapses.
   */
  private async stopReplay(): Promise<void> {
    const sessionId = this._sessionId;
    const replayId = this._replayId;
    if (!sessionId || !replayId) {
      return;
    }

    console.log('Stopping replay recording...');
    await this.kernel.browsers.replays.stop(replayId, {
      id: sessionId,
    });
    console.log('Replay recording stopped. Processing video...');

    // Wait a moment for processing
    await this.sleep(REPLAY_PROCESSING_DELAY_MS);

    // Poll for replay to be ready (with timeout)
    const startTime = Date.now();
    let replayReady = false;

    while (!replayReady && Date.now() - startTime < REPLAY_POLL_TIMEOUT_MS) {
      try {
        const replays = await this.kernel.browsers.replays.list(sessionId);
        for (const replay of replays) {
          if (replay.replay_id === replayId) {
            this._replayViewUrl = replay.replay_view_url ?? null;
            replayReady = true;
            break;
          }
        }
      } catch {
        // Best effort: a failed poll is retried until the timeout expires.
      }
      if (!replayReady) {
        await this.sleep(REPLAY_POLL_INTERVAL_MS);
      }
    }

    if (!replayReady) {
      console.log('Warning: Replay may still be processing');
    } else if (this._replayViewUrl) {
      console.log(`Replay view URL: ${this._replayViewUrl}`);
    }
  }

  /**
   * Stops the replay (if one is recording) and destroys the browser.
   *
   * Safe to call when the session was never started: it then returns an
   * info object with empty strings. If stopping the replay throws, the
   * browser is still destroyed and the error is rethrown afterwards.
   * Internal state is cleared as soon as the browser has been deleted, so
   * it is only kept (for a retry) when the delete call itself fails.
   *
   * @returns Session details captured before the state was cleared.
   */
  async stop(): Promise<SessionInfo> {
    // Build info object directly to avoid throwing if session wasn't started
    const currentSessionId = this._sessionId;
    const info = this.snapshot(currentSessionId || '');

    if (!currentSessionId) {
      this.resetState();
      return info;
    }

    try {
      // Stop replay if recording was enabled
      if (this.options.recordReplay && this._replayId) {
        // Wait grace period before stopping to capture final state
        if (this.options.replayGracePeriod > 0) {
          console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`);
          await this.sleep(this.options.replayGracePeriod * 1000);
        }
        await this.stopReplay();
        info.replayViewUrl = this._replayViewUrl || undefined;
      }
    } finally {
      // Always clean up the browser session, even if replay stopping fails
      console.log(`Destroying browser session: ${currentSessionId}`);
      await this.kernel.browsers.deleteByID(currentSessionId);
      console.log('Browser session destroyed.');
      // The browser is gone; never leave the object pointing at it, even
      // when an error from the replay step is about to propagate.
      this.resetState();
    }

    return info;
  }

  /** Builds a `SessionInfo` from the current private state. */
  private snapshot(sessionId: string): SessionInfo {
    return {
      sessionId,
      liveViewUrl: this._liveViewUrl || '',
      replayId: this._replayId || undefined,
      replayViewUrl: this._replayViewUrl || undefined,
    };
  }

  /** Clears all per-session state. */
  private resetState(): void {
    this._sessionId = null;
    this._liveViewUrl = null;
    this._replayId = null;
    this._replayViewUrl = null;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
/**
 * Computer Tool - Maps high-level actions to Kernel's Computer Controls API.
 */

import { Buffer } from 'buffer';
import type { Kernel } from '@onkernel/sdk';

// NOTE: the type declarations at the bottom of this block belong to
// `tools/types/computer.ts`; the tool imports them from './types/computer'.
// They are kept together here because both files share this span.

const TYPING_DELAY_MS = 12;
const SCREENSHOT_DELAY_MS = 500;
/** Pixels scrolled per `scroll_document` action. */
const DOCUMENT_SCROLL_DELTA = 500;
/** Normalized magnitude used by `scroll_at` when the caller gives none. */
const DEFAULT_SCROLL_MAGNITUDE = 800;
/** The 8-byte signature every PNG file starts with. */
const PNG_SIGNATURE = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);

/** True for real, finite numbers (rejects NaN, Infinity and non-numbers). */
function isFiniteNumber(value: unknown): value is number {
  return typeof value === 'number' && Number.isFinite(value);
}

/** Scales a value on the 0..COORDINATE_SCALE grid to pixels, without clamping. */
function scaleToPixels(value: number, extent: number): number {
  return Math.round((value / COORDINATE_SCALE) * extent);
}

/**
 * Converts a normalized coordinate to a pixel index inside the screen.
 * The result is clamped to [0, extent - 1] so that the grid maximum maps to
 * the last pixel instead of one past it.
 */
function toPixelCoordinate(value: number, extent: number): number {
  const lastPixel = Math.max(extent - 1, 0);
  return Math.min(Math.max(scaleToPixels(value, extent), 0), lastPixel);
}

/**
 * Translates a scroll direction into x/y deltas.
 *
 * @returns The deltas, or `null` when the direction is not one of
 *          up/down/left/right.
 */
function scrollDeltas(
  direction: string,
  amount: number,
): { delta_x: number; delta_y: number } | null {
  switch (direction) {
    case 'down':
      return { delta_x: 0, delta_y: amount };
    case 'up':
      return { delta_x: 0, delta_y: -amount };
    case 'right':
      return { delta_x: amount, delta_y: 0 };
    case 'left':
      return { delta_x: -amount, delta_y: 0 };
    default:
      return null;
  }
}

/** Coerces a requested click count to a whole number of at least 1. */
function normalizeClickCount(clicks: unknown): number {
  return isFiniteNumber(clicks) ? Math.max(1, Math.round(clicks)) : 1;
}

/**
 * Executes named computer-use actions against one Kernel browser session and
 * returns a fresh screenshot after each of them.
 */
export class ComputerTool {
  private kernel: Kernel;
  private sessionId: string;
  private screenSize: ScreenSize;

  constructor(kernel: Kernel, sessionId: string, screenSize: ScreenSize = DEFAULT_SCREEN_SIZE) {
    this.kernel = kernel;
    this.sessionId = sessionId;
    this.screenSize = screenSize;
  }

  getScreenSize(): ScreenSize {
    return this.screenSize;
  }

  getKernel(): Kernel {
    return this.kernel;
  }

  getSessionId(): string {
    return this.sessionId;
  }

  private denormalizeX(x: number): number {
    return toPixelCoordinate(x, this.screenSize.width);
  }

  private denormalizeY(y: number): number {
    return toPixelCoordinate(y, this.screenSize.height);
  }

  /**
   * Captures a screenshot after a short settle delay.
   *
   * When the PNG header can be parsed, the tool's screen size is updated to
   * the screenshot's dimensions. Failures are reported through the `error`
   * field of the result instead of being thrown.
   */
  async screenshot(): Promise<ToolResult> {
    try {
      await this.sleep(SCREENSHOT_DELAY_MS);
      const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId);
      const blob = await response.blob();
      const buffer = Buffer.from(await blob.arrayBuffer());
      const dimensions = parsePngDimensions(buffer);
      if (dimensions) {
        this.screenSize = dimensions;
      }

      return {
        base64Image: buffer.toString('base64'),
        url: 'about:blank',
        width: dimensions?.width,
        height: dimensions?.height,
      };
    } catch (error) {
      return {
        error: `Failed to take screenshot: ${error}`,
        url: 'about:blank',
      };
    }
  }

  /**
   * Runs one action and returns the resulting screenshot.
   *
   * Coordinates in `args` are on the 0..COORDINATE_SCALE grid and are
   * converted to pixels with the current screen size.
   *
   * @param actionName One of the `ComputerAction` values.
   * @param args Action arguments; which fields are required depends on the action.
   * @returns A screenshot result, or a result whose `error` field describes
   *          the invalid input or the failure. This method does not throw.
   */
  async executeAction(actionName: string, args: ComputerFunctionArgs): Promise<ToolResult> {
    if (!PREDEFINED_COMPUTER_USE_FUNCTIONS.includes(actionName as ComputerAction)) {
      return { error: `Unknown action: ${actionName}` };
    }

    try {
      switch (actionName) {
        case ComputerAction.OPEN_WEB_BROWSER:
          // Nothing to do here; only the screenshot below is taken.
          break;

        case ComputerAction.CLICK_AT: {
          if (!isFiniteNumber(args.x) || !isFiniteNumber(args.y)) {
            return { error: 'click_at requires x and y coordinates' };
          }
          await this.kernel.browsers.computer.clickMouse(this.sessionId, {
            x: this.denormalizeX(args.x),
            y: this.denormalizeY(args.y),
            button: 'left',
            click_type: 'click',
            num_clicks: normalizeClickCount(args.clicks),
          });
          break;
        }

        case ComputerAction.HOVER_AT: {
          if (!isFiniteNumber(args.x) || !isFiniteNumber(args.y)) {
            return { error: 'hover_at requires x and y coordinates' };
          }
          await this.kernel.browsers.computer.moveMouse(this.sessionId, {
            x: this.denormalizeX(args.x),
            y: this.denormalizeY(args.y),
          });
          break;
        }

        case ComputerAction.TYPE_TEXT_AT: {
          if (!isFiniteNumber(args.x) || !isFiniteNumber(args.y)) {
            return { error: 'type_text_at requires x and y coordinates' };
          }
          if (!args.text) {
            return { error: 'type_text_at requires text' };
          }

          // Click first so the target element has focus.
          await this.kernel.browsers.computer.clickMouse(this.sessionId, {
            x: this.denormalizeX(args.x),
            y: this.denormalizeY(args.y),
            button: 'left',
            click_type: 'click',
            num_clicks: 1,
          });

          // Clearing is the default; only an explicit `false` disables it.
          if (args.clear_before_typing !== false) {
            await this.kernel.browsers.computer.pressKey(this.sessionId, {
              keys: ['ctrl+a'],
            });
            await this.sleep(50);
          }

          await this.kernel.browsers.computer.typeText(this.sessionId, {
            text: args.text,
            delay: TYPING_DELAY_MS,
          });

          if (args.press_enter) {
            await this.sleep(100);
            await this.kernel.browsers.computer.pressKey(this.sessionId, {
              keys: ['Return'],
            });
          }
          break;
        }

        case ComputerAction.SCROLL_DOCUMENT: {
          if (!args.direction) {
            return { error: 'scroll_document requires direction' };
          }
          const deltas = scrollDeltas(args.direction, DOCUMENT_SCROLL_DELTA);
          if (!deltas) {
            return { error: `scroll_document got unsupported direction: ${args.direction}` };
          }

          // Scroll with the pointer at the centre of the screen.
          await this.kernel.browsers.computer.scroll(this.sessionId, {
            x: Math.round(this.screenSize.width / 2),
            y: Math.round(this.screenSize.height / 2),
            ...deltas,
          });
          break;
        }

        case ComputerAction.SCROLL_AT: {
          if (!isFiniteNumber(args.x) || !isFiniteNumber(args.y)) {
            return { error: 'scroll_at requires x and y coordinates' };
          }
          if (!args.direction) {
            return { error: 'scroll_at requires direction' };
          }

          // The magnitude is a distance on the normalized grid, so it is
          // scaled by the axis being scrolled but never clamped to the screen.
          const vertical = args.direction === 'up' || args.direction === 'down';
          const extent = vertical ? this.screenSize.height : this.screenSize.width;
          const magnitude = scaleToPixels(args.magnitude ?? DEFAULT_SCROLL_MAGNITUDE, extent);

          const deltas = scrollDeltas(args.direction, magnitude);
          if (!deltas) {
            return { error: `scroll_at got unsupported direction: ${args.direction}` };
          }

          await this.kernel.browsers.computer.scroll(this.sessionId, {
            x: this.denormalizeX(args.x),
            y: this.denormalizeY(args.y),
            ...deltas,
          });
          break;
        }

        case ComputerAction.WAIT_5_SECONDS:
          await this.sleep(5000);
          break;

        case ComputerAction.GO_BACK:
          await this.kernel.browsers.computer.pressKey(this.sessionId, {
            keys: ['alt+Left'],
          });
          await this.sleep(1000);
          break;

        case ComputerAction.GO_FORWARD:
          await this.kernel.browsers.computer.pressKey(this.sessionId, {
            keys: ['alt+Right'],
          });
          await this.sleep(1000);
          break;

        case ComputerAction.SEARCH:
          // Presses ctrl+l; the caller is expected to type next.
          await this.kernel.browsers.computer.pressKey(this.sessionId, {
            keys: ['ctrl+l'],
          });
          break;

        case ComputerAction.NAVIGATE: {
          if (!args.url) {
            return { error: 'navigate requires url' };
          }
          await this.kernel.browsers.computer.pressKey(this.sessionId, {
            keys: ['ctrl+l'],
          });
          await this.sleep(100);
          await this.kernel.browsers.computer.typeText(this.sessionId, {
            text: args.url,
            delay: TYPING_DELAY_MS,
          });
          await this.sleep(100);
          await this.kernel.browsers.computer.pressKey(this.sessionId, {
            keys: ['Return'],
          });
          await this.sleep(1500);
          break;
        }

        case ComputerAction.KEY_COMBINATION: {
          if (!args.keys) {
            return { error: 'key_combination requires keys' };
          }
          // A bare "enter" (any casing) is sent as 'Return'.
          const keyValue = String(args.keys).toLowerCase() === 'enter' ? 'Return' : args.keys;
          await this.kernel.browsers.computer.pressKey(this.sessionId, {
            keys: [keyValue],
          });
          break;
        }

        case ComputerAction.DRAG_AND_DROP: {
          if (
            !isFiniteNumber(args.x) ||
            !isFiniteNumber(args.y) ||
            !isFiniteNumber(args.destination_x) ||
            !isFiniteNumber(args.destination_y)
          ) {
            return { error: 'drag_and_drop requires x, y, destination_x, and destination_y' };
          }

          await this.kernel.browsers.computer.dragMouse(this.sessionId, {
            path: [
              [this.denormalizeX(args.x), this.denormalizeY(args.y)],
              [this.denormalizeX(args.destination_x), this.denormalizeY(args.destination_y)],
            ],
            button: 'left',
          });
          break;
        }

        default:
          return { error: `Unhandled action: ${actionName}` };
      }

      // Note: screenshot() waits SCREENSHOT_DELAY_MS again before capturing,
      // so the total settle time after an action is twice that constant.
      await this.sleep(SCREENSHOT_DELAY_MS);
      return await this.screenshot();
    } catch (error) {
      return { error: `Action failed: ${error}`, url: 'about:blank' };
    }
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

/**
 * Reads the image size from a PNG header.
 *
 * Checks the 8-byte PNG signature and that the first chunk is `IHDR`, then
 * reads the big-endian width and height stored at byte offsets 16 and 20.
 *
 * @returns The dimensions, or `null` if the buffer is too short, is not a
 *          PNG, does not start with an IHDR chunk, or has a zero dimension.
 */
function parsePngDimensions(buffer: Buffer): ScreenSize | null {
  if (buffer.length < 24) return null;
  if (!buffer.subarray(0, 8).equals(PNG_SIGNATURE)) return null;
  // Bytes 12..15 hold the type of the first chunk; only IHDR carries the size.
  if (buffer.toString('latin1', 12, 16) !== 'IHDR') return null;
  const width = buffer.readUInt32BE(16);
  const height = buffer.readUInt32BE(20);
  if (!width || !height) return null;
  return { width, height };
}

/**
 * Type definitions for computer use actions.
 */

/** Names of the actions `ComputerTool.executeAction` understands. */
export enum ComputerAction {
  OPEN_WEB_BROWSER = 'open_web_browser',
  CLICK_AT = 'click_at',
  HOVER_AT = 'hover_at',
  TYPE_TEXT_AT = 'type_text_at',
  SCROLL_DOCUMENT = 'scroll_document',
  SCROLL_AT = 'scroll_at',
  WAIT_5_SECONDS = 'wait_5_seconds',
  GO_BACK = 'go_back',
  GO_FORWARD = 'go_forward',
  SEARCH = 'search',
  NAVIGATE = 'navigate',
  KEY_COMBINATION = 'key_combination',
  DRAG_AND_DROP = 'drag_and_drop',
}

/** Every action name, as an array, for membership checks. */
export const PREDEFINED_COMPUTER_USE_FUNCTIONS = Object.values(ComputerAction);

export type ScrollDirection = 'up' | 'down' | 'left' | 'right';

/** Union of the arguments of all actions; each action reads only its own fields. */
export interface ComputerFunctionArgs {
  x?: number;
  y?: number;
  clicks?: number;

  text?: string;
  press_enter?: boolean;
  clear_before_typing?: boolean;

  direction?: ScrollDirection;
  magnitude?: number;

  url?: string;

  keys?: string;

  destination_x?: number;
  destination_y?: number;

  safety_decision?: {
    decision: string;
    explanation: string;
  };
}

/** Outcome of an action or screenshot; `error` is set when it failed. */
export interface ToolResult {
  base64Image?: string;
  url?: string;
  error?: string;
  width?: number;
  height?: number;
}

export interface ScreenSize {
  width: number;
  height: number;
}

export const DEFAULT_SCREEN_SIZE: ScreenSize = {
  width: 1200,
  height: 800,
};

/** Size of the normalized coordinate grid used by action arguments. */
export const COORDINATE_SCALE = 1000;
file mode 100644 index 0000000..43284c6 --- /dev/null +++ b/pkg/templates/typescript/moondream-groq-computer-use/tsconfig.json @@ -0,0 +1,23 @@ +{ + "compilerOptions": { + "lib": ["ESNext", "DOM"], + "target": "ESNext", + "module": "ESNext", + "moduleDetection": "force", + "jsx": "react-jsx", + "allowJs": true, + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "noEmit": true, + "strict": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "noPropertyAccessFromIndexSignature": false + }, + "include": ["./**/*.ts", "./**/*.tsx"], + "exclude": ["node_modules", "dist"] +}