diff --git a/README.md b/README.md index af92648..7a721e9 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,9 @@ Commands with JSON output support: - `sample-app` - Basic template with Playwright integration - `captcha-solver` - Template demonstrating Kernel's auto-CAPTCHA solver - `stagehand` - Template with Stagehand SDK (TypeScript only) + - `ehr-system` - EHR system automation demo with Anthropic Computer Use (TypeScript only) - `browser-use` - Template with Browser Use SDK (Python only) + - `lead-scraper` - Google Maps lead scraper using Browser Use (Python only) - `anthropic-computer-use` - Anthropic Computer Use prompt loop - `openai-computer-use` - OpenAI Computer Use Agent sample - `gemini-computer-use` - Implements a Gemini computer use agent (TypeScript only) @@ -449,6 +451,12 @@ kernel create --name my-cu-app --language py --template anthropic-computer-use # Create a Claude Agent SDK app (TypeScript or Python) kernel create --name my-claude-agent --language ts --template claude-agent-sdk + +# Create a Google Maps Lead Scraper (Python) +kernel create --name my-lead-scraper --language python --template lead-scraper + +# Create an EHR System Automation (TypeScript) +kernel create --name my-ehr-bot --language ts --template ehr-system ``` ### Deploy with environment variables diff --git a/pkg/templates/python/lead-scraper/.env.example b/pkg/templates/python/lead-scraper/.env.example new file mode 100644 index 0000000..b74e0a2 --- /dev/null +++ b/pkg/templates/python/lead-scraper/.env.example @@ -0,0 +1,2 @@ +# Copy this file to .env and fill in your API key +OPENAI_API_KEY=your_openai_api_key_here diff --git a/pkg/templates/python/lead-scraper/README.md b/pkg/templates/python/lead-scraper/README.md new file mode 100644 index 0000000..e9e98b0 --- /dev/null +++ b/pkg/templates/python/lead-scraper/README.md @@ -0,0 +1,113 @@ +# Kernel Lead Scraper Template - Google Maps + +A ready-to-use lead scraper that extracts local business data from Google Maps using 
[browser-use](https://github.com/browser-use/browser-use) and the Kernel platform. + +## What It Does + +This template creates an AI-powered web scraper that: +1. Navigates to Google Maps +2. Searches for businesses by type and location +3. Scrolls through results to load more listings +4. Extracts structured lead data (name, phone, address, website, rating, reviews) +5. Returns clean JSON ready for your CRM or outreach tools + +## Quick Start + +### 1. Install Dependencies + +```bash +uv sync +``` + +### 2. Set Up Environment + +```bash +cp .env.example .env +# Edit .env and add your OpenAI API key +``` + +### 3. Deploy to Kernel + +```bash +kernel deploy main.py -e OPENAI_API_KEY=$OPENAI_API_KEY +``` + +### 4. Run the Scraper + +```bash +kernel invoke lead-scraper scrape-leads \ + --data '{"query": "restaurants", "location": "Austin, TX", "max_results": 5}' +``` + +## API Reference + +### Action: `scrape-leads` + +**Input Parameters:** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `query` | string | ❌ | "restaurants" | Business type to search (e.g., "plumbers", "gyms") | +| `location` | string | ❌ | "New York, NY" | Geographic location (e.g., "Miami, FL") | +| `max_results` | integer | ❌ | 1 | Maximum leads to scrape (1-5) | + +**Example Output:** + +```json +{ + "leads": [ + { + "name": "Joe's Pizza", + "phone": "(512) 555-0123", + "address": "123 Main St, Austin, TX 78701", + "website": "https://joespizza.com", + "rating": 4.5, + "review_count": 234, + "category": "Pizza restaurant" + } + ], + "total_found": 1, + "query": "pizza restaurants", + "location": "Austin, TX" +} +``` + +## Use Cases + +- **Sales Teams**: Build targeted prospect lists for cold outreach +- **Marketing Agencies**: Find local businesses needing marketing services +- **Service Providers**: Identify potential B2B clients in your area +- **Market Research**: Analyze competitor density and ratings by location + +## Customization + +### Modify 
the Search Prompt + +Edit the `SCRAPER_PROMPT` in `main.py` to customize what data the AI extracts: + +```python +SCRAPER_PROMPT = """ +Navigate to Google Maps and search for {query} in {location}. +# Add your custom extraction instructions here +""" +``` + +### Add New Fields + +1. Update `BusinessLead` model in `models.py` +2. Modify the prompt to extract the new fields +3. Redeploy with `kernel deploy main.py` + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| No results found | Try a broader search query or different location | +| Timeout errors | Reduce `max_results` or check your network | +| Rate limiting | Add delays between requests in production | + +## Resources + +- [Kernel Documentation](https://www.kernel.sh/docs) +- [Browser Use Docs](https://docs.browser-use.com) +- [Pydantic Models](https://docs.pydantic.dev) diff --git a/pkg/templates/python/lead-scraper/_gitignore b/pkg/templates/python/lead-scraper/_gitignore new file mode 100644 index 0000000..75475bc --- /dev/null +++ b/pkg/templates/python/lead-scraper/_gitignore @@ -0,0 +1,79 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +.project +.pydevproject +.settings/ + +# Testing +.coverage +htmlcov/ +.pytest_cache/ +.tox/ +.nox/ +coverage.xml +*.cover +.hypothesis/ + +# Logs +*.log +logs/ + +# OS +.DS_Store +Thumbs.db + +# Browser Use specific +.playwright-screenshots/ +.playwright-videos/ +.playwright-report/ +test-results/ +blob-report/ +playwright/.cache/ +playwright/.local-browsers/ + +# Lead Scraper specific +leads_output/ +*.csv +*.json + +# Misc +.cache/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.temp/ +.tmp/ diff --git a/pkg/templates/python/lead-scraper/formaters.py 
b/pkg/templates/python/lead-scraper/formaters.py new file mode 100644 index 0000000..60256c2 --- /dev/null +++ b/pkg/templates/python/lead-scraper/formaters.py @@ -0,0 +1,208 @@ +import json +import re +from typing import Any, Iterable +from models import BusinessLead + +_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.IGNORECASE | re.DOTALL) +_TRAILING_COMMA_RE = re.compile(r",\s*([\]}])") +_SMART_QUOTES = { + "\u201c": '"', "\u201d": '"', # “ ” + "\u2018": "'", "\u2019": "'", # ‘ ’ +} + + +def parse_leads_from_result(result_text: str) -> list[BusinessLead]: + """ + Robustly extract a JSON array of leads from an LLM/browser agent output and + convert it into BusinessLead objects. + + Strategy: + 1) Prefer JSON inside ```json ... ``` fenced blocks + 2) Else try to decode from the first '[' onwards using JSONDecoder.raw_decode + 3) Normalize a few common LLM issues (smart quotes, trailing commas, "null" strings) + """ + if not result_text or not result_text.strip(): + return [] + + candidates = _extract_json_candidates(result_text) + + for candidate in candidates: + parsed = _try_parse_json_list(candidate) + if parsed is None: + continue + + leads: list[BusinessLead] = [] + for raw in parsed: + lead = _to_business_lead(raw) + if lead is not None: + leads.append(lead) + + if leads: + return leads # first successful parse wins + + # Fallback: try to parse markdown format (when agent returns numbered lists) + leads = _parse_markdown_leads(result_text) + if leads: + return leads + + return [] + + +def _parse_markdown_leads(text: str) -> list[BusinessLead]: + """ + Parse markdown-formatted lead data when JSON parsing fails. + Handles format like: + 1. 
**Business Name** + - Address: 123 Main St + - Rating: 4.5 + - Phone: +1 555-1234 + """ + leads = [] + + # Pattern to match numbered entries with bold names + entry_pattern = re.compile( + r'\d+\.\s*\*\*(.+?)\*\*\s*\n((?:\s*-\s*.+\n?)+)', + re.MULTILINE + ) + + for match in entry_pattern.finditer(text): + name = match.group(1).strip() + details = match.group(2) + + # Extract fields from the dash-prefixed lines + def extract_field(pattern: str, txt: str) -> str | None: + m = re.search(pattern, txt, re.IGNORECASE) + return m.group(1).strip() if m else None + + address = extract_field(r'-\s*Address:\s*(.+?)(?:\n|$)', details) + rating_str = extract_field(r'-\s*Rating:\s*([\d.]+)', details) + review_str = extract_field(r'-\s*Review\s*Count:\s*([\d,]+)', details) + category = extract_field(r'-\s*Category:\s*(.+?)(?:\n|$)', details) + phone = extract_field(r'-\s*Phone:\s*(.+?)(?:\n|$)', details) + website = extract_field(r'-\s*Website:\s*(.+?)(?:\n|$)', details) + + # Clean up "Not available" etc + if phone and phone.lower() in ('not available', 'n/a', 'none'): + phone = None + if website and website.lower() in ('not available', 'n/a', 'none'): + website = None + + try: + lead = BusinessLead( + name=name, + address=address, + rating=float(rating_str) if rating_str else None, + review_count=int(review_str.replace(',', '')) if review_str else None, + category=category, + phone=phone, + website=website, + ) + leads.append(lead) + except Exception: + continue + + return leads + + +def _extract_json_candidates(text: str) -> list[str]: + """ + Return possible JSON snippets, ordered from most to least likely. 
+ """ + # 1) Fenced code blocks first + fenced = [m.group(1) for m in _JSON_FENCE_RE.finditer(text)] + if fenced: + return fenced + + # 2) Otherwise try from first '[' onward (common "Return ONLY a JSON array") + idx = text.find("[") + return [text[idx:]] if idx != -1 else [] + + +def _normalize_llm_json(s: str) -> str: + # Replace smart quotes + for k, v in _SMART_QUOTES.items(): + s = s.replace(k, v) + + # Some models do ``key``: ``value``. Convert double-backticks to quotes carefully. + # (Keep this minimal: it can still be wrong, but it helps common cases.) + s = s.replace("``", '"') + + # Convert string "null" to JSON null + s = s.replace('"null"', "null") + + # Remove trailing commas before ] or } + s = _TRAILING_COMMA_RE.sub(r"\1", s) + + return s.strip() + + +def _try_parse_json_list(candidate: str) -> list[dict[str, Any]] | None: + """ + Attempt to parse a JSON array from a candidate snippet. + Returns a list of dicts or None. + """ + candidate = _normalize_llm_json(candidate) + + # 1) Direct parse + try: + data = json.loads(candidate) + return data if isinstance(data, list) else None + except json.JSONDecodeError: + pass + + # 2) Decoder-based parse from first '[' (more robust than find/rfind slicing) + start = candidate.find("[") + if start == -1: + return None + + decoder = json.JSONDecoder() + try: + obj, _end = decoder.raw_decode(candidate[start:]) + return obj if isinstance(obj, list) else None + except json.JSONDecodeError: + return None + + +def _to_business_lead(raw: Any) -> BusinessLead | None: + """ + Convert one raw object into a BusinessLead, best-effort. 
+ """ + if not isinstance(raw, dict): + return None + + try: + # Optionally coerce some common fields + rating = raw.get("rating") + if isinstance(rating, str): + rating = _safe_float(rating) + + review_count = raw.get("review_count") + if isinstance(review_count, str): + review_count = _safe_int(review_count) + + return BusinessLead( + name=(raw.get("name") or "Unknown").strip() if isinstance(raw.get("name"), str) else (raw.get("name") or "Unknown"), + phone=raw.get("phone"), + address=raw.get("address"), + website=raw.get("website"), + rating=rating, + review_count=review_count, + category=raw.get("category"), + ) + except Exception: + # Keep parsing the rest; caller decides how to log + return None + + +def _safe_float(x: str) -> float | None: + try: + return float(x.replace(",", "").strip()) + except Exception: + return None + + +def _safe_int(x: str) -> int | None: + try: + return int(x.replace(",", "").strip()) + except Exception: + return None diff --git a/pkg/templates/python/lead-scraper/main.py b/pkg/templates/python/lead-scraper/main.py new file mode 100644 index 0000000..aa7a7eb --- /dev/null +++ b/pkg/templates/python/lead-scraper/main.py @@ -0,0 +1,170 @@ +""" +Google Maps Lead Scraper - Kernel Template + +This template demonstrates how to build a lead scraper using browser-use +to extract local business data from Google Maps. 
+ +Usage: + kernel deploy main.py -e OPENAI_API_KEY=$OPENAI_API_KEY + kernel invoke lead-scraper scrape-leads --data '{"query": "restaurants", "location": "Austin, TX"}' +""" + +import json + +import kernel +from browser_use import Agent, Browser +from browser_use.llm import ChatOpenAI +from kernel import Kernel +from formaters import parse_leads_from_result + +from models import BusinessLead, ScrapeInput, ScrapeOutput + +# Initialize Kernel client and app +client = Kernel() +app = kernel.App("lead-scraper") + +# LLM for the browser-use agent +# API key is set via: kernel deploy main.py -e OPENAI_API_KEY=XXX +llm = ChatOpenAI(model="gpt-4.1") + +# ============================================================================ +# SCRAPER PROMPT +# Customize this prompt to change what data the agent extracts +# ============================================================================ +SCRAPER_PROMPT = """ +You are a lead generation assistant. Scrape business information from Google Maps. + +**Instructions:** +1. Navigate to https://www.google.com/maps +2. Search for: "{query} in {location}" +3. Wait for results to load +4. For each of the max {max_results} businesses in the list: + a. Click on the listing to open its detail view + b. SCROLL DOWN in the detail panel to see all info (phone/website are often below) + c. Extract: name, address, rating, review count, category, phone number, website + d. Click back or the X to close the detail view and return to the list +5. 
After collecting data for max {max_results} businesses, return the JSON + +**What to extract:** +- Business name (REQUIRED) +- Address (REQUIRED) +- Star rating (REQUIRED) +- Review count (optional) +- Category (optional) +- Phone number (scroll down in detail view to find it, null if not shown) +- Website URL (scroll down in detail view to find it, null if not shown) + +**Important:** +- SCROLL DOWN inside each business detail panel to find phone/website +- Use null for any field that isn't available +- Task is SUCCESSFUL when you return at least 1 complete business + +**CRITICAL - Output Format:** +You MUST return ONLY a valid JSON array. No markdown, no explanations, no numbered lists. +Return EXACTLY this format: +[ + {{"name": "Business Name", "address": "123 Main St", "rating": 4.5, "review_count": 100, "category": "Restaurant", "phone": "+1 555-1234", "website": "https://example.com"}} +] +""" + +@app.action("scrape-leads") +async def scrape_leads(ctx: kernel.KernelContext, input_data: dict) -> dict: + """ + Scrape local business leads from Google Maps. + + This action uses browser-use to navigate Google Maps, search for businesses, + and extract structured lead data. 
+ + Args: + ctx: Kernel context containing invocation information + input_data: Dictionary with query, location, and max_results + + Returns: + ScrapeOutput containing list of leads and metadata + + Example: + kernel invoke lead-scraper scrape-leads \ + --data '{"query": "plumbers", "location": "Miami, FL", "max_results": 15}' + """ + # Validate input - default to empty dict if no payload provided + scrape_input = ScrapeInput(**(input_data or {})) + + # Use attribute access for Pydantic model (not dictionary subscript) + input_query = scrape_input.query + input_location = scrape_input.location + input_max_results = scrape_input.max_results + + # Format the prompt with user parameters + task_prompt = SCRAPER_PROMPT.format( + query=input_query, + location=input_location, + max_results=input_max_results, + ) + + print(f"Starting lead scrape: {input_query} in {input_location}") + print(f"Target: {input_max_results} leads") + + # Create Kernel browser session + kernel_browser = None + + try: + + kernel_browser = client.browsers.create( + invocation_id=ctx.invocation_id, + stealth=True, # Use stealth mode to avoid detection + ) + print(f"Browser live view: {kernel_browser.browser_live_view_url}") + + # Connect browser-use to the Kernel browser + browser = Browser( + cdp_url=kernel_browser.cdp_ws_url, + headless=False, + window_size={"width": 1920, "height": 1080}, + viewport={"width": 1920, "height": 1080}, + device_scale_factor=1.0, + ) + + # Create and run the browser-use agent + agent = Agent( + task=task_prompt, + llm=llm, + browser_session=browser, + ) + + print("Running browser-use agent...") + # Limit steps to prevent timeouts (this is a template demo) + result = await agent.run(max_steps=25) + + # Parse the result from final_result + leads = [] + final_text = result.final_result() + + if final_text: + print(f"Parsing final_result ({len(final_text)} chars)...") + leads = parse_leads_from_result(final_text) + else: + # If no final_result, check the last action for 
done text + print("No final_result, checking last action...") + action_results = result.action_results() + if action_results: + last_action = action_results[-1] + if hasattr(last_action, 'extracted_content') and last_action.extracted_content: + content = last_action.extracted_content + print(f"Found content in last action ({len(content)} chars)...") + leads = parse_leads_from_result(content) + + print(f"Successfully extracted {len(leads)} leads") + + output = ScrapeOutput( + leads=leads, + total_found=len(leads), + query=input_query, + location=input_location, + ) + return output.model_dump() + + finally: + # Always clean up the browsers session + if kernel_browser is not None: + client.browsers.delete_by_id(kernel_browser.session_id) + print("Browser session cleaned up") diff --git a/pkg/templates/python/lead-scraper/models.py b/pkg/templates/python/lead-scraper/models.py new file mode 100644 index 0000000..2d3c6e4 --- /dev/null +++ b/pkg/templates/python/lead-scraper/models.py @@ -0,0 +1,65 @@ +from pydantic import BaseModel, Field +from typing import Optional + + +class ScrapeInput(BaseModel): + """Input parameters for the lead scraper. + + Attributes: + query: The type of business to search (e.g., "restaurants", "plumbers", "gyms") + location: The geographic location to search (e.g., "Austin, TX", "New York, NY") + max_results: Maximum number of leads to scrape (default: 2, max: 5) + """ + + query: str = Field( + default="restaurants", + description="Type of business to search for (e.g., 'restaurants', 'plumbers')" + ) + location: str = Field( + default="New York, NY", + description="Geographic location (e.g., 'Austin, TX', 'New York, NY')" + ) + max_results: int = Field( + default=1, + ge=1, + le=5, + description="Maximum number of leads to scrape (1-5)", + ) + + +class BusinessLead(BaseModel): + """Structured data for a business lead scraped from Google Maps. 
+ + Attributes: + name: Business name + phone: Phone number (if available) + address: Full address + website: Website URL (if available) + rating: Star rating (1-5) + review_count: Number of reviews + category: Business category/type + """ + + name: str = Field(description="Business name") + phone: Optional[str] = Field(default=None, description="Phone number") + address: Optional[str] = Field(default=None, description="Full address") + website: Optional[str] = Field(default=None, description="Website URL") + rating: Optional[float] = Field(default=None, ge=1, le=5, description="Star rating") + review_count: Optional[int] = Field(default=None, ge=0, description="Number of reviews") + category: Optional[str] = Field(default=None, description="Business category") + + +class ScrapeOutput(BaseModel): + """Output from the lead scraper. + + Attributes: + leads: List of scraped business leads + total_found: Total number of leads found + query: The original search query + location: The original search location + """ + + leads: list[BusinessLead] = Field(default_factory=list, description="List of scraped leads") + total_found: int = Field(default=0, description="Total number of leads found") + query: str = Field(description="Original search query") + location: str = Field(description="Original search location") diff --git a/pkg/templates/python/lead-scraper/pyproject.toml b/pkg/templates/python/lead-scraper/pyproject.toml new file mode 100644 index 0000000..2c33639 --- /dev/null +++ b/pkg/templates/python/lead-scraper/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "lead-scraper" +version = "0.1.0" +description = "Google Maps Lead Scraper - A Kernel template for scraping local business leads" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "browser-use>=0.11.1", + "kernel>=0.23.0", + "pydantic>=2.12.5", +] diff --git a/pkg/templates/typescript/ehr-system/.env.example b/pkg/templates/typescript/ehr-system/.env.example new file mode 100644 index 
0000000..80a79e6 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/.env.example @@ -0,0 +1 @@ +ANTHROPIC_API_KEY= \ No newline at end of file diff --git a/pkg/templates/typescript/ehr-system/.gitignore b/pkg/templates/typescript/ehr-system/.gitignore new file mode 100644 index 0000000..d8f3372 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/.gitignore @@ -0,0 +1,3 @@ +node_modules +.DS_Store +.env diff --git a/pkg/templates/typescript/ehr-system/README.md b/pkg/templates/typescript/ehr-system/README.md new file mode 100644 index 0000000..ae27b85 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/README.md @@ -0,0 +1,38 @@ +# EHR System Automation Template + +This template demonstrates how to use **Anthropic's Computer Use** capabilities on Kernel to automate an Electronic Health Records (EHR) system workflow. + +## Logic + +The automation performs the following steps: +1. Navigate to the hosted demo EHR login page (`https://ehr-system-six.vercel.app/login`, set as `LOGIN_URL` in `index.ts`). +2. Authenticate using valid credentials (any email/password works for this demo). +3. Navigate to the **Medical Reports** page. +4. Click the **Download Summary of Care** button to download the report. + +This template uses an agentic loop where Claude analyzes screenshots of the page and drives the Kernel browser through computer-use tool calls. + +## Usage + +1. **Deploy the app:** + + ```bash + kernel deploy index.ts -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY + ``` + +2. **Invoke the action:** + + ```bash + kernel invoke ehr-system export-report + ``` + +3. **View logs:** + + ```bash + kernel logs ehr-system --follow + ``` + +## Requirements + +- ANTHROPIC_API_KEY environment variable set. +- Kernel CLI installed and authenticated. 
diff --git a/pkg/templates/typescript/ehr-system/index.ts b/pkg/templates/typescript/ehr-system/index.ts new file mode 100644 index 0000000..43d9d82 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/index.ts @@ -0,0 +1,101 @@ +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { samplingLoop } from './loop'; +import { KernelBrowserSession } from './session'; + +interface Input { + task?: string; + record_replay?: boolean; +} + +interface Output { + elapsed: number; + result: string | null; + replay_url?: string | null; +} + +const kernel = new Kernel(); +const app = kernel.app('ehr-system'); + +// LLM API Keys are set in the environment during `kernel deploy -e ANTHROPIC_API_KEY=XXX` +// See https://www.kernel.sh/docs/launch/deploy#environment-variables +const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY; + +if (!ANTHROPIC_API_KEY) { + throw new Error('ANTHROPIC_API_KEY is not set'); +} + +const LOGIN_URL = 'https://ehr-system-six.vercel.app/login'; + +const DEFAULT_TASK = ` +Go to ${LOGIN_URL} +Login with username: Phil1 | password: phil | email: heya@invalid.email.com. +Navigate to the "Medical Reports" page. +Find the "Download Summary of Care" button and click it to download the report. +`; + +app.action( + 'export-report', + async (ctx: KernelContext, payload?: Input): Promise => { + const start = Date.now(); + const task = payload?.task || DEFAULT_TASK; + + // Create browser session with optional replay recording + const session = new KernelBrowserSession(kernel, { + stealth: true, + recordReplay: payload?.record_replay ?? false, + }); + + await session.start(); + console.log('> Kernel browser live view url:', session.liveViewUrl); + + try { + // Run the sampling loop with Anthropic Computer Use + const finalMessages = await samplingLoop({ + model: 'claude-sonnet-4-5-20250929', + messages: [{ + role: 'user', + content: `You are an automated agent. Current date and time: ${new Date().toISOString()}. 
You must complete the task fully without asking for permission.\n\nTask: ${task}`, + }], + apiKey: ANTHROPIC_API_KEY, + thinkingBudget: 1024, + kernel, + sessionId: session.sessionId, + }); + + // Extract the final result from the messages + if (finalMessages.length === 0) { + throw new Error('No messages were generated during the sampling loop'); + } + + const lastMessage = finalMessages[finalMessages.length - 1]; + if (!lastMessage) { + throw new Error('Failed to get the last message from the sampling loop'); + } + + const result = typeof lastMessage.content === 'string' + ? lastMessage.content + : lastMessage.content.map(block => + block.type === 'text' ? block.text : '' + ).join(''); + + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); + + // Stop session and get replay URL if recording was enabled + const sessionInfo = await session.stop(); + + return { + elapsed, + result, + replay_url: sessionInfo.replayViewUrl, + }; + } catch (error) { + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); + console.error('Error in export-report:', error); + await session.stop(); + return { + elapsed, + result: null, + }; + } + }, +); diff --git a/pkg/templates/typescript/ehr-system/loop.ts b/pkg/templates/typescript/ehr-system/loop.ts new file mode 100644 index 0000000..06e22ca --- /dev/null +++ b/pkg/templates/typescript/ehr-system/loop.ts @@ -0,0 +1,196 @@ +import { Anthropic } from '@anthropic-ai/sdk'; +import { DateTime } from 'luxon'; +import type { Kernel } from '@onkernel/sdk'; +import { DEFAULT_TOOL_VERSION, TOOL_GROUPS_BY_VERSION, ToolCollection, type ToolVersion } from './tools/collection'; +import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; +import type { ActionParams } from './tools/types/computer'; +import { Action } from './tools/types/computer'; +import type { BetaMessageParam, BetaTextBlock } from './types/beta'; +import { injectPromptCaching, maybeFilterToNMostRecentImages, 
PROMPT_CACHING_BETA_FLAG, responseToParams } from './utils/message-processing'; +import { makeApiToolResult } from './utils/tool-results'; + +// System prompt optimized for the environment +const SYSTEM_PROMPT = ` +* You are utilising an Ubuntu virtual machine using ${process.arch} architecture with internet access. +* When you connect to the display, CHROMIUM IS ALREADY OPEN. The url bar is not visible but it is there. +* If you need to navigate to a new page, use ctrl+l to focus the url bar and then enter the url. +* You won't be able to see the url bar from the screenshot but ctrl-l still works. +* As the initial step click on the search bar. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. +* Either that, or make sure you scroll down to see everything before deciding something isn't available. +* When using your computer function calls, they take a while to run and send back to you. +* Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. +* After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. +* Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. +* Only when you confirm a step was executed correctly should you move on to the next one. + + + +* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". +* Instead, click on the search bar on the center of the screen where it says "Search or enter address", and enter the appropriate search term or URL there. 
+`; + +// Add new type definitions +interface ThinkingConfig { + type: 'enabled'; + budget_tokens: number; +} + +interface ExtraBodyConfig { + thinking?: ThinkingConfig; +} + +interface ToolUseInput extends Record { + action: Action; +} + +export async function samplingLoop({ + model, + systemPromptSuffix, + messages, + apiKey, + onlyNMostRecentImages, + maxTokens = 4096, + toolVersion, + thinkingBudget, + tokenEfficientToolsBeta = false, + kernel, + sessionId, +}: { + model: string; + systemPromptSuffix?: string; + messages: BetaMessageParam[]; + apiKey: string; + onlyNMostRecentImages?: number; + maxTokens?: number; + toolVersion?: ToolVersion; + thinkingBudget?: number; + tokenEfficientToolsBeta?: boolean; + kernel: Kernel; + sessionId: string; +}): Promise { + const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION; + const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; + const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(kernel, sessionId))); + + const system: BetaTextBlock = { + type: 'text', + text: `${SYSTEM_PROMPT}${systemPromptSuffix ? ' ' + systemPromptSuffix : ''}`, + }; + + while (true) { + const betas: string[] = toolGroup.beta_flag ? 
[toolGroup.beta_flag] : []; + + if (tokenEfficientToolsBeta) { + betas.push('token-efficient-tools-2025-02-19'); + } + + let imageTruncationThreshold = onlyNMostRecentImages || 0; + + const client = new Anthropic({ apiKey, maxRetries: 4 }); + const enablePromptCaching = true; + + if (enablePromptCaching) { + betas.push(PROMPT_CACHING_BETA_FLAG); + injectPromptCaching(messages); + onlyNMostRecentImages = 0; + (system as BetaTextBlock).cache_control = { type: 'ephemeral' }; + } + + if (onlyNMostRecentImages) { + maybeFilterToNMostRecentImages( + messages, + onlyNMostRecentImages, + imageTruncationThreshold + ); + } + + const extraBody: ExtraBodyConfig = {}; + if (thinkingBudget) { + extraBody.thinking = { type: 'enabled', budget_tokens: thinkingBudget }; + } + + const toolParams = toolCollection.toParams(); + + const response = await client.beta.messages.create({ + max_tokens: maxTokens, + messages, + model, + system: [system], + tools: toolParams as any[], + betas, + ...extraBody, + }); + + const responseParams = responseToParams(response); + + const loggableContent = responseParams.map(block => { + if (block.type === 'tool_use') { + return { + type: 'tool_use', + name: block.name, + input: block.input + }; + } + return block; + }); + console.log('=== LLM RESPONSE ==='); + console.log('Stop reason:', response.stop_reason); + console.log(loggableContent); + console.log("===") + + messages.push({ + role: 'assistant', + content: responseParams, + }); + + if (response.stop_reason === 'end_turn') { + console.log('LLM has completed its task, ending loop'); + return messages; + } + + const toolResultContent = []; + let hasToolUse = false; + + for (const contentBlock of responseParams) { + if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input && typeof contentBlock.input === 'object') { + const input = contentBlock.input as ToolUseInput; + if ('action' in input && typeof input.action === 'string') { + hasToolUse = true; + const toolInput: 
ActionParams = { + action: input.action as Action, + ...Object.fromEntries( + Object.entries(input).filter(([key]) => key !== 'action') + ) + }; + + try { + const result = await toolCollection.run( + contentBlock.name, + toolInput + ); + + const toolResult = makeApiToolResult(result, contentBlock.id!); + toolResultContent.push(toolResult); + } catch (error) { + console.error(error); + throw error; + } + } + } + } + + if (toolResultContent.length === 0 && !hasToolUse && response.stop_reason !== 'tool_use') { + console.log('No tool use or results, and not waiting for tool use, ending loop'); + return messages; + } + + if (toolResultContent.length > 0) { + messages.push({ + role: 'user', + content: toolResultContent, + }); + } + } +} diff --git a/pkg/templates/typescript/ehr-system/package-lock.json b/pkg/templates/typescript/ehr-system/package-lock.json new file mode 100644 index 0000000..e91f864 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/package-lock.json @@ -0,0 +1,121 @@ +{ + "name": "ehr-system", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "ehr-system", + "dependencies": { + "@anthropic-ai/sdk": "^0.71.2", + "@onkernel/sdk": "^0.24.0", + "luxon": "^3.7.2" + }, + "devDependencies": { + "@types/luxon": "^3.7.1", + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } + }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.71.2", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.71.2.tgz", + "integrity": "sha512-TGNDEUuEstk/DKu0/TflXAEt+p+p/WhTlFzEnoosvbaDU2LTjm42igSdlL0VijrKpWejtOKxX0b8A7uc+XiSAQ==", + "license": "MIT", + "dependencies": { + "json-schema-to-ts": "^3.1.1" + }, + "bin": { + "anthropic-ai-sdk": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.25.0 || ^4.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/@babel/runtime": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz", + 
"integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@onkernel/sdk": { + "version": "0.24.0", + "resolved": "https://registry.npmjs.org/@onkernel/sdk/-/sdk-0.24.0.tgz", + "integrity": "sha512-f0xZGSaC9Nlg7CwLw6agyw682sc9Q8rPRG6Zyk82JmCKETFBdMqfyXuxK5uESidk0pQp/GYGG8rHy+vGa5jgCQ==", + "license": "Apache-2.0" + }, + "node_modules/@types/luxon": { + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@types/luxon/-/luxon-3.7.1.tgz", + "integrity": "sha512-H3iskjFIAn5SlJU7OuxUmTEpebK6TKB8rxZShDslBMZJ5u9S//KM1sbdAisiSrqwLQncVjnpi2OK2J51h+4lsg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "22.19.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.7.tgz", + "integrity": "sha512-MciR4AKGHWl7xwxkBa6xUGxQJ4VBOmPTF7sL+iGzuahOFaO0jHCsuEfS80pan1ef4gWId1oWOweIhrDEYLuaOw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/json-schema-to-ts": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz", + "integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==", + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.18.3", + "ts-algebra": "^2.0.0" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/luxon": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.7.2.tgz", + "integrity": "sha512-vtEhXh/gNjI9Yg1u4jX/0YVPMvxzHuGgCm6tC5kZyb08yjGWGnqAjGJvcXbqQR2P3MyMEFnRbpcdFS6PBcLqew==", + "license": "MIT", + "engines": { + "node": ">=12" + } + }, + "node_modules/ts-algebra": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", + "integrity": 
"sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==", + "license": "MIT" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/pkg/templates/typescript/ehr-system/package.json b/pkg/templates/typescript/ehr-system/package.json new file mode 100644 index 0000000..7552171 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/package.json @@ -0,0 +1,19 @@ +{ + "name": "ehr-system", + "module": "index.ts", + "type": "module", + "private": true, + "scripts": { + "build": "tsc" + }, + "dependencies": { + "@anthropic-ai/sdk": "^0.71.2", + "@onkernel/sdk": "^0.24.0", + "luxon": "^3.7.2" + }, + "devDependencies": { + "@types/luxon": "^3.7.1", + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } +} diff --git a/pkg/templates/typescript/ehr-system/session.ts b/pkg/templates/typescript/ehr-system/session.ts new file mode 100644 index 0000000..3aeb77c --- /dev/null +++ b/pkg/templates/typescript/ehr-system/session.ts @@ -0,0 +1,222 @@ +/** + * Kernel Browser Session Manager. + * + * Provides a class for managing Kernel browser lifecycle + * with optional video replay recording. 
/** Options accepted by {@link KernelBrowserSession}; unset fields fall back to `DEFAULT_OPTIONS`. */
export interface SessionOptions {
  /** Enable stealth mode to avoid bot detection */
  stealth?: boolean;
  /** Browser session timeout in seconds */
  timeoutSeconds?: number;
  /** Enable replay recording (requires paid plan) */
  recordReplay?: boolean;
  /** Grace period in seconds before stopping replay */
  replayGracePeriod?: number;
}

/** Identifiers and viewing URLs describing one browser session. */
export interface SessionInfo {
  sessionId: string;
  liveViewUrl: string;
  replayId?: string;
  replayViewUrl?: string;
}

const DEFAULT_OPTIONS: Required<SessionOptions> = {
  stealth: true,
  timeoutSeconds: 300,
  recordReplay: false,
  replayGracePeriod: 5.0,
};

/**
 * Manages Kernel browser lifecycle with optional replay recording.
 *
 * Usage:
 * ```typescript
 * const session = new KernelBrowserSession(kernel, options);
 * await session.start();
 * try {
 *   // Use session.sessionId for computer controls
 * } finally {
 *   await session.stop();
 * }
 * ```
 *
 * `stop()` is safe to call when no session is active (before `start()`, or a
 * second time), so it can always be placed in a `finally` block.
 */
export class KernelBrowserSession {
  private kernel: Kernel;
  private options: Required<SessionOptions>;

  // Session state
  private _sessionId: string | null = null;
  private _liveViewUrl: string | null = null;
  private _replayId: string | null = null;
  private _replayViewUrl: string | null = null;

  /**
   * @param kernel - Kernel SDK client used for every browser/replay call.
   * @param options - Overrides merged on top of `DEFAULT_OPTIONS`.
   */
  constructor(kernel: Kernel, options: SessionOptions = {}) {
    this.kernel = kernel;
    this.options = { ...DEFAULT_OPTIONS, ...options };
  }

  /**
   * Id of the active browser session.
   *
   * @throws Error if no session is active.
   */
  get sessionId(): string {
    if (!this._sessionId) {
      throw new Error('Session not started. Call start() first.');
    }
    return this._sessionId;
  }

  get liveViewUrl(): string | null {
    return this._liveViewUrl;
  }

  get replayViewUrl(): string | null {
    return this._replayViewUrl;
  }

  /**
   * Snapshot of the active session.
   *
   * @throws Error if no session is active (via {@link sessionId}).
   */
  get info(): SessionInfo {
    return {
      sessionId: this.sessionId,
      liveViewUrl: this._liveViewUrl || '',
      replayId: this._replayId || undefined,
      replayViewUrl: this._replayViewUrl || undefined,
    };
  }

  /**
   * Create a Kernel browser session and optionally start recording.
   *
   * A failure to start the replay is logged and otherwise ignored, so the
   * browser session is still usable without a recording.
   *
   * @returns Info for the newly created session.
   */
  async start(): Promise<SessionInfo> {
    // Create browser with specified settings
    const browser = await this.kernel.browsers.create({
      stealth: this.options.stealth,
      timeout_seconds: this.options.timeoutSeconds,
      viewport: {
        width: 1024,
        height: 768,
        refresh_rate: 60,
      },
    });

    this._sessionId = browser.session_id;
    this._liveViewUrl = browser.browser_live_view_url ?? null;

    console.log(`Kernel browser created: ${this._sessionId}`);
    console.log(`Live view URL: ${this._liveViewUrl}`);

    // Start replay recording if enabled
    if (this.options.recordReplay) {
      try {
        await this.startReplay();
      } catch (error) {
        console.warn(`Warning: Failed to start replay recording: ${error}`);
        console.warn('Continuing without replay recording.');
      }
    }

    return this.info;
  }

  /**
   * Start recording a replay of the browser session.
   */
  private async startReplay(): Promise<void> {
    if (!this._sessionId) {
      return;
    }

    console.log('Starting replay recording...');
    const replay = await this.kernel.browsers.replays.start(this._sessionId);
    this._replayId = replay.replay_id;
    console.log(`Replay recording started: ${this._replayId}`);
  }

  /**
   * Stop recording and get the replay URL.
   *
   * After stopping, the replay list is polled once per second for up to 60
   * seconds until the entry for this recording shows up.
   */
  private async stopReplay(): Promise<void> {
    if (!this._sessionId || !this._replayId) {
      return;
    }

    console.log('Stopping replay recording...');
    await this.kernel.browsers.replays.stop(this._replayId, {
      id: this._sessionId,
    });
    console.log('Replay recording stopped. Processing video...');

    // Wait a moment for processing
    await this.sleep(2000);

    // Poll for replay to be ready (with timeout)
    const maxWait = 60000; // 60 seconds
    const startTime = Date.now();
    let replayReady = false;

    while (Date.now() - startTime < maxWait) {
      try {
        const replays = await this.kernel.browsers.replays.list(this._sessionId);
        for (const replay of replays) {
          if (replay.replay_id === this._replayId) {
            this._replayViewUrl = replay.replay_view_url ?? null;
            replayReady = true;
            break;
          }
        }
        if (replayReady) {
          break;
        }
      } catch {
        // Deliberate best-effort: a failed poll is simply retried on the next tick.
      }
      await this.sleep(1000);
    }

    if (!replayReady) {
      console.log('Warning: Replay may still be processing');
    } else if (this._replayViewUrl) {
      console.log(`Replay view URL: ${this._replayViewUrl}`);
    }
  }

  /**
   * Stop recording, and delete the browser session.
   *
   * Calling this with no active session is a no-op that resolves to an info
   * object with an empty `sessionId`; it never throws "Session not started".
   *
   * @returns Info captured for the session that was stopped, including the
   *   replay view URL when one became available.
   */
  async stop(): Promise<SessionInfo> {
    const activeSessionId = this._sessionId;

    // Build the snapshot from the raw fields rather than the `info` getter:
    // the getter throws when no session exists, which would surface from the
    // caller's `finally` block and hide whatever error led there.
    const info: SessionInfo = {
      sessionId: activeSessionId ?? '',
      liveViewUrl: this._liveViewUrl || '',
      replayId: this._replayId || undefined,
      replayViewUrl: this._replayViewUrl || undefined,
    };

    if (!activeSessionId) {
      return info;
    }

    try {
      // Stop replay if recording was enabled
      if (this.options.recordReplay && this._replayId) {
        // Wait grace period before stopping to capture final state
        if (this.options.replayGracePeriod > 0) {
          console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`);
          await this.sleep(this.options.replayGracePeriod * 1000);
        }
        await this.stopReplay();
        info.replayViewUrl = this._replayViewUrl || undefined;
      }
    } finally {
      // Always clean up the browser session, even if replay stopping fails
      console.log(`Destroying browser session: ${activeSessionId}`);
      await this.kernel.browsers.deleteByID(activeSessionId);
      console.log('Browser session destroyed.');
    }

    // Reset state (only reached when deletion succeeded, so a failed delete can be retried)
    this._sessionId = null;
    this._liveViewUrl = null;
    this._replayId = null;
    this._replayViewUrl = null;

    return info;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
a/pkg/templates/typescript/ehr-system/tools/collection.ts b/pkg/templates/typescript/ehr-system/tools/collection.ts new file mode 100644 index 0000000..155352d --- /dev/null +++ b/pkg/templates/typescript/ehr-system/tools/collection.ts @@ -0,0 +1,61 @@ +import { ComputerTool20241022, ComputerTool20250124 } from './computer'; +import { Action } from './types/computer'; +import type { ActionParams, ToolResult } from './types/computer'; + +export type ToolVersion = 'computer_use_20250124' | 'computer_use_20241022' | 'computer_use_20250429'; + +export const DEFAULT_TOOL_VERSION: ToolVersion = 'computer_use_20250429'; + +interface ToolGroup { + readonly version: ToolVersion; + readonly tools: (typeof ComputerTool20241022 | typeof ComputerTool20250124)[]; + readonly beta_flag: string; +} + +export const TOOL_GROUPS: ToolGroup[] = [ + { + version: 'computer_use_20241022', + tools: [ComputerTool20241022], + beta_flag: 'computer-use-2024-10-22', + }, + { + version: 'computer_use_20250124', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, + // 20250429 version inherits from 20250124 + { + version: 'computer_use_20250429', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, +]; + +export const TOOL_GROUPS_BY_VERSION: Record = Object.fromEntries( + TOOL_GROUPS.map(group => [group.version, group]) +) as Record; + +export class ToolCollection { + private tools: Map; + + constructor(...tools: (ComputerTool20241022 | ComputerTool20250124)[]) { + this.tools = new Map(tools.map(tool => [tool.name, tool])); + } + + toParams(): unknown[] { + return Array.from(this.tools.values()).map(tool => tool.toParams()); + } + + async run(name: string, toolInput: ActionParams): Promise { + const tool = this.tools.get(name); + if (!tool) { + throw new Error(`Tool ${name} not found`); + } + + if (!Object.values(Action).includes(toolInput.action)) { + throw new Error(`Invalid action ${toolInput.action} for tool ${name}`); + } + + return await 
const TYPING_DELAY_MS = 12;

// Pause after an input action so the page can react before the follow-up screenshot.
const POST_ACTION_DELAY_MS = 500;

// Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior)
const SCROLL_PIXELS_PER_UNIT = 120;

// Type for the tool parameters sent to Anthropic API
export interface ComputerToolParams {
  name: 'computer';
  type: 'computer_20241022' | 'computer_20250124';
  display_width_px: number;
  display_height_px: number;
  display_number: null;
}

/**
 * Anthropic "computer" tool backed by the Kernel computer-controls API.
 *
 * Every action except `cursor_position` ends by returning a fresh screenshot
 * of the browser session as a base64 string.
 */
export class ComputerTool implements BaseAnthropicTool {
  name: 'computer' = 'computer';
  protected kernel: Kernel;
  protected sessionId: string;
  /** Seconds to wait before every screenshot capture. */
  protected _screenshotDelay = 2.0;
  protected version: '20241022' | '20250124';

  /** Last coordinate a mouse action targeted; used when scroll/drag omit a start point. */
  private lastMousePosition: [number, number] = [0, 0];

  private readonly mouseActions = new Set<Action>([
    Action.LEFT_CLICK,
    Action.RIGHT_CLICK,
    Action.MIDDLE_CLICK,
    Action.DOUBLE_CLICK,
    Action.TRIPLE_CLICK,
    Action.MOUSE_MOVE,
    Action.LEFT_MOUSE_DOWN,
    Action.LEFT_MOUSE_UP,
  ]);

  private readonly keyboardActions = new Set<Action>([
    Action.KEY,
    Action.TYPE,
    Action.HOLD_KEY,
  ]);

  private readonly systemActions = new Set<Action>([
    Action.SCREENSHOT,
    Action.CURSOR_POSITION,
    Action.SCROLL,
    Action.WAIT,
  ]);

  constructor(kernel: Kernel, sessionId: string, version: '20241022' | '20250124' = '20250124') {
    this.kernel = kernel;
    this.sessionId = sessionId;
    this.version = version;
  }

  get apiType(): 'computer_20241022' | 'computer_20250124' {
    return this.version === '20241022' ? 'computer_20241022' : 'computer_20250124';
  }

  /** Tool definition sent to the Anthropic API. */
  toParams(): ComputerToolParams {
    const params: ComputerToolParams = {
      name: this.name,
      type: this.apiType,
      display_width_px: 1024,
      display_height_px: 768,
      display_number: null,
    };
    return params;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /** Wait for the UI to settle after an input action, then capture a screenshot. */
  private async settleAndCapture(): Promise<ToolResult> {
    await this.sleep(POST_ACTION_DELAY_MS);
    return await this.screenshot();
  }

  private getMouseButton(action: Action): 'left' | 'right' | 'middle' {
    switch (action) {
      case Action.LEFT_CLICK:
      case Action.DOUBLE_CLICK:
      case Action.TRIPLE_CLICK:
      case Action.LEFT_CLICK_DRAG:
      case Action.LEFT_MOUSE_DOWN:
      case Action.LEFT_MOUSE_UP:
        return 'left';
      case Action.RIGHT_CLICK:
        return 'right';
      case Action.MIDDLE_CLICK:
        return 'middle';
      default:
        throw new ToolError(`Invalid mouse action: ${action}`);
    }
  }

  /**
   * Perform a move, press, release or (multi-)click at `coordinate`.
   * The remembered mouse position is updated only after the API call succeeds.
   */
  private async handleMouseAction(action: Action, coordinate: [number, number]): Promise<ToolResult> {
    const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate);

    if (action === Action.MOUSE_MOVE) {
      await this.kernel.browsers.computer.moveMouse(this.sessionId, {
        x,
        y,
      });
    } else if (action === Action.LEFT_MOUSE_DOWN || action === Action.LEFT_MOUSE_UP) {
      await this.kernel.browsers.computer.clickMouse(this.sessionId, {
        x,
        y,
        button: 'left',
        click_type: action === Action.LEFT_MOUSE_DOWN ? 'down' : 'up',
      });
    } else {
      const button = this.getMouseButton(action);
      let numClicks = 1;
      if (action === Action.DOUBLE_CLICK) {
        numClicks = 2;
      } else if (action === Action.TRIPLE_CLICK) {
        numClicks = 3;
      }

      await this.kernel.browsers.computer.clickMouse(this.sessionId, {
        x,
        y,
        button,
        click_type: 'click',
        num_clicks: numClicks,
      });
    }

    this.lastMousePosition = [x, y];
    return await this.settleAndCapture();
  }

  /**
   * Perform `hold_key`, `key` or `type`.
   *
   * @param duration - Hold time in seconds (`hold_key` only); converted to milliseconds.
   */
  private async handleKeyboardAction(action: Action, text: string, duration?: number): Promise<ToolResult> {
    if (action === Action.HOLD_KEY) {
      const key = this.convertToKernelKey(text);
      await this.kernel.browsers.computer.pressKey(this.sessionId, {
        keys: [key],
        duration: duration ? duration * 1000 : undefined,
      });
    } else if (action === Action.KEY) {
      const key = this.convertKeyCombinationToKernel(text);
      await this.kernel.browsers.computer.pressKey(this.sessionId, {
        keys: [key],
      });
    } else {
      await this.kernel.browsers.computer.typeText(this.sessionId, {
        text,
        delay: TYPING_DELAY_MS,
      });
    }

    return await this.settleAndCapture();
  }

  // Key mappings for Kernel Computer Controls API (xdotool format).
  // Entries are keyed by the lower-cased alias; a Map (not a plain object) is used so
  // names such as "constructor" cannot resolve to inherited Object.prototype members.
  private static readonly KEY_MAP: ReadonlyMap<string, string> = new Map([
    // Enter/Return
    ['return', 'Return'],
    ['enter', 'Return'],
    // Arrow keys
    ['left', 'Left'],
    ['right', 'Right'],
    ['up', 'Up'],
    ['down', 'Down'],
    ['arrowleft', 'Left'],
    ['arrowright', 'Right'],
    ['arrowup', 'Up'],
    ['arrowdown', 'Down'],
    // Navigation
    ['home', 'Home'],
    ['end', 'End'],
    ['pageup', 'Page_Up'],
    ['page_up', 'Page_Up'],
    ['pagedown', 'Page_Down'],
    ['page_down', 'Page_Down'],
    // Editing
    ['delete', 'Delete'],
    ['backspace', 'BackSpace'],
    ['tab', 'Tab'],
    ['insert', 'Insert'],
    // Escape
    ['esc', 'Escape'],
    ['escape', 'Escape'],
    // Function keys
    ['f1', 'F1'],
    ['f2', 'F2'],
    ['f3', 'F3'],
    ['f4', 'F4'],
    ['f5', 'F5'],
    ['f6', 'F6'],
    ['f7', 'F7'],
    ['f8', 'F8'],
    ['f9', 'F9'],
    ['f10', 'F10'],
    ['f11', 'F11'],
    ['f12', 'F12'],
    // Misc
    ['space', 'space'],
    ['minus', 'minus'],
    ['equal', 'equal'],
    ['plus', 'plus'],
  ]);

  // Modifier key mappings (xdotool format), keyed by lower-cased alias
  private static readonly MODIFIER_MAP: ReadonlyMap<string, string> = new Map([
    ['ctrl', 'ctrl'],
    ['control', 'ctrl'],
    ['alt', 'alt'],
    ['shift', 'shift'],
    ['meta', 'super'],
    ['cmd', 'super'],
    ['command', 'super'],
    ['win', 'super'],
    ['super', 'super'],
  ]);

  /**
   * Translate one key name to the name sent to Kernel.
   * Aliases match case-insensitively (modifiers first, then special keys);
   * anything without a mapping is returned unchanged, preserving its case.
   */
  private convertToKernelKey(key: string): string {
    const alias = key.toLowerCase();
    return ComputerTool.MODIFIER_MAP.get(alias) ?? ComputerTool.KEY_MAP.get(alias) ?? key;
  }

  private convertKeyCombinationToKernel(combo: string): string {
    // Handle key combinations (e.g., "ctrl+a", "Control+t")
    if (combo.includes('+')) {
      return combo
        .split('+')
        .map(part => this.convertToKernelKey(part.trim()))
        .join('+');
    }
    // Single key - just convert it
    return this.convertToKernelKey(combo);
  }

  /**
   * Capture the session's screen after waiting `_screenshotDelay` seconds.
   *
   * @throws ToolError wrapping any failure from the capture call.
   */
  async screenshot(): Promise<ToolResult> {
    try {
      console.log('Starting screenshot...');
      await this.sleep(this._screenshotDelay * 1000);
      const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId);
      const blob = await response.blob();
      const arrayBuffer = await blob.arrayBuffer();
      const buffer = Buffer.from(arrayBuffer);
      console.log('Screenshot taken, size:', buffer.length, 'bytes');

      return {
        base64Image: buffer.toString('base64'),
      };
    } catch (error) {
      throw new ToolError(`Failed to take screenshot: ${error}`);
    }
  }

  /**
   * Execute one tool action and return the resulting screenshot.
   *
   * @param params - Action name plus its arguments. Scroll arguments are accepted in
   *   both camelCase (`scrollDirection`, `scrollAmount`) and snake_case
   *   (`scroll_direction`, `scroll_amount`) spellings.
   * @throws ToolError for unsupported actions, missing/invalid arguments, or a
   *   failed screenshot.
   */
  async call(params: ActionParams): Promise<ToolResult> {
    const {
      action,
      text,
      coordinate,
      scrollDirection: scrollDirectionParam,
      scroll_amount,
      scrollAmount,
      duration,
      ...kwargs
    } = params;

    ActionValidator.validateActionParams(params, this.mouseActions, this.keyboardActions);

    if (action === Action.SCREENSHOT) {
      return await this.screenshot();
    }

    if (action === Action.CURSOR_POSITION) {
      throw new ToolError('Cursor position is not available with Kernel Computer Controls API');
    }

    if (action === Action.SCROLL) {
      if (this.version !== '20250124') {
        throw new ToolError(`${action} is only available in version 20250124`);
      }

      const scrollDirection = (scrollDirectionParam || kwargs.scroll_direction) as string | undefined;
      // `??` rather than `||`: an explicit amount of 0 must not fall through to the other spelling.
      const scrollAmountValue = scrollAmount ?? scroll_amount;

      if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(String(scrollDirection))) {
        throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`);
      }
      // Number.isFinite also rejects NaN/Infinity, which would otherwise reach the API as a delta.
      if (typeof scrollAmountValue !== 'number' || !Number.isFinite(scrollAmountValue) || scrollAmountValue < 0) {
        throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`);
      }

      const [x, y] = coordinate
        ? ActionValidator.validateAndGetCoordinates(coordinate)
        : this.lastMousePosition;

      let delta_x = 0;
      let delta_y = 0;
      const scrollDelta = scrollAmountValue * SCROLL_PIXELS_PER_UNIT;

      switch (scrollDirection) {
        case 'down':
          delta_y = scrollDelta;
          break;
        case 'up':
          delta_y = -scrollDelta;
          break;
        case 'right':
          delta_x = scrollDelta;
          break;
        case 'left':
          delta_x = -scrollDelta;
          break;
      }

      await this.kernel.browsers.computer.scroll(this.sessionId, {
        x,
        y,
        delta_x,
        delta_y,
      });

      return await this.settleAndCapture();
    }

    if (action === Action.WAIT) {
      if (this.version !== '20250124') {
        throw new ToolError(`${action} is only available in version 20250124`);
      }
      // validateActionParams already checks this; the guard narrows the type without `!`.
      if (duration === undefined) {
        throw new ToolError(`duration is required for ${action}`);
      }
      await this.sleep(duration * 1000);
      return await this.screenshot();
    }

    if (action === Action.LEFT_CLICK_DRAG) {
      if (!coordinate) {
        throw new ToolError(`coordinate is required for ${action}`);
      }

      const [endX, endY] = ActionValidator.validateAndGetCoordinates(coordinate);
      const startCoordinate = kwargs.start_coordinate as [number, number] | undefined;
      // Without an explicit start, drag from wherever the last mouse action ended.
      const [startX, startY] = startCoordinate
        ? ActionValidator.validateAndGetCoordinates(startCoordinate)
        : this.lastMousePosition;

      console.log(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`);

      await this.kernel.browsers.computer.dragMouse(this.sessionId, {
        path: [[startX, startY], [endX, endY]],
        button: 'left',
      });

      this.lastMousePosition = [endX, endY];

      return await this.settleAndCapture();
    }

    if (this.mouseActions.has(action)) {
      if (!coordinate) {
        throw new ToolError(`coordinate is required for ${action}`);
      }
      return await this.handleMouseAction(action, coordinate);
    }

    if (this.keyboardActions.has(action)) {
      if (!text) {
        throw new ToolError(`text is required for ${action}`);
      }
      return await this.handleKeyboardAction(action, text, duration);
    }

    throw new ToolError(`Invalid action: ${action}`);
  }
}

// For backward compatibility
export class ComputerTool20241022 extends ComputerTool {
  constructor(kernel: Kernel, sessionId: string) {
    super(kernel, sessionId, '20241022');
  }
}

export class ComputerTool20250124 extends ComputerTool {
  constructor(kernel: Kernel, sessionId: string) {
    super(kernel, sessionId, '20250124');
  }
}
/**
 * Translates loosely-spelled key names ("ctrl", "return", "page_down") into
 * Playwright key names ("Control", "Enter", "PageDown").
 *
 * Alias lookup is case-insensitive. Names without an alias are passed through
 * so that standard Playwright names ("ArrowLeft", "Enter", "Shift") keep the
 * exact casing Playwright requires.
 */
export class KeyboardUtils {
  // Alternative names for the standard Playwright modifier keys, keyed by lower-cased alias.
  // Maps (not plain objects) are used so that names like "constructor" cannot match
  // inherited Object.prototype members.
  private static readonly modifierKeyMap: ReadonlyMap<string, string> = new Map([
    ['ctrl', 'Control'],
    ['control', 'Control'],
    ['alt', 'Alt'],
    ['shift', 'Shift'],
    ['meta', 'Meta'],
    ['cmd', 'Meta'],
    ['command', 'Meta'],
    ['win', 'Meta'],
  ]);

  // Essential key mappings for Playwright compatibility, keyed by lower-cased alias
  private static readonly keyMap: ReadonlyMap<string, string> = new Map([
    ['return', 'Enter'],
    ['enter', 'Enter'],
    ['space', ' '],
    ['left', 'ArrowLeft'],
    ['right', 'ArrowRight'],
    ['up', 'ArrowUp'],
    ['down', 'ArrowDown'],
    ['home', 'Home'],
    ['end', 'End'],
    ['pageup', 'PageUp'],
    ['page_up', 'PageUp'],
    ['pagedown', 'PageDown'],
    ['page_down', 'PageDown'],
    ['delete', 'Delete'],
    ['backspace', 'Backspace'],
    ['tab', 'Tab'],
    ['esc', 'Escape'],
    ['escape', 'Escape'],
    ['insert', 'Insert'],
    ['super_l', 'Meta'],
    ['f1', 'F1'],
    ['f2', 'F2'],
    ['f3', 'F3'],
    ['f4', 'F4'],
    ['f5', 'F5'],
    ['f6', 'F6'],
    ['f7', 'F7'],
    ['f8', 'F8'],
    ['f9', 'F9'],
    ['f10', 'F10'],
    ['f11', 'F11'],
    ['f12', 'F12'],
    ['minus', '-'],
    ['equal', '='],
    ['plus', '+'],
  ]);

  private static readonly modifierNames: ReadonlySet<string> = new Set([
    'Control',
    'Alt',
    'Shift',
    'Meta',
  ]);

  /**
   * Whether `key` names (in any supported spelling or casing) one of the
   * Control / Alt / Shift / Meta modifiers.
   *
   * @returns `false` for `undefined` or an empty string.
   */
  static isModifierKey(key: string | undefined): boolean {
    if (!key) return false;
    return KeyboardUtils.modifierNames.has(KeyboardUtils.getPlaywrightKey(key));
  }

  /**
   * Resolve one key name to its Playwright equivalent.
   *
   * @param key - Key name; aliases are matched case-insensitively.
   * @returns The mapped Playwright name, or `key` unchanged when no alias exists.
   * @throws Error if `key` is `undefined` or empty.
   */
  static getPlaywrightKey(key: string | undefined): string {
    if (!key) {
      throw new Error('Key cannot be undefined');
    }

    const alias = key.toLowerCase();

    // Special keys take precedence over modifier aliases
    const special = KeyboardUtils.keyMap.get(alias);
    if (special !== undefined) {
      return special;
    }

    const modifier = KeyboardUtils.modifierKeyMap.get(alias);
    if (modifier !== undefined) {
      return modifier;
    }

    // Return the key as is - Playwright handles standard key names
    return key;
  }

  /**
   * Split a "+"-separated combination (e.g. "Ctrl+Shift+T") into Playwright key names.
   *
   * Single characters are lower-cased ("T" -> "t"); multi-character names that
   * have no alias keep the caller's casing, since Playwright key names such as
   * "ArrowLeft" are case-sensitive.
   *
   * @throws Error if `combo` is empty or contains an empty segment (e.g. "ctrl+").
   */
  static parseKeyCombination(combo: string): string[] {
    if (!combo) {
      throw new Error('Key combination cannot be empty');
    }
    return combo.split('+').map(segment => {
      const trimmedKey = segment.trim();
      if (!trimmedKey) {
        throw new Error('Invalid key combination: empty key');
      }
      const keyName = trimmedKey.length === 1 ? trimmedKey.toLowerCase() : trimmedKey;
      return KeyboardUtils.getPlaywrightKey(keyName);
    });
  }
}
void { + if (required && text === undefined) { + throw new ToolError(`text is required for ${action}`); + } + if (text !== undefined && typeof text !== 'string') { + throw new ToolError(`${text} must be a string`); + } + } + + static validateCoordinate(coordinate: Coordinate | undefined, required: boolean, action: string): void { + if (required && !coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + if (coordinate) { + this.validateAndGetCoordinates(coordinate); + } + } + + static validateDuration(duration: Duration | undefined): void { + if (duration === undefined || typeof duration !== 'number') { + throw new ToolError(`${duration} must be a number`); + } + if (duration < 0) { + throw new ToolError(`${duration} must be non-negative`); + } + if (duration > 100) { + throw new ToolError(`${duration} is too long`); + } + } + + static validateAndGetCoordinates(coordinate: Coordinate): Coordinate { + if (!Array.isArray(coordinate) || coordinate.length !== 2) { + throw new ToolError(`${coordinate} must be a tuple of length 2`); + } + if (!coordinate.every(i => typeof i === 'number' && i >= 0)) { + throw new ToolError(`${coordinate} must be a tuple of non-negative numbers`); + } + return coordinate; + } + + static validateActionParams(params: ActionParams, mouseActions: Set, keyboardActions: Set): void { + const { action, text, coordinate, duration } = params; + + // Validate text parameter + if (keyboardActions.has(action)) { + this.validateText(text, true, action); + } else { + this.validateText(text, false, action); + } + + // Validate coordinate parameter + if (mouseActions.has(action)) { + this.validateCoordinate(coordinate, true, action); + } else { + this.validateCoordinate(coordinate, false, action); + } + + // Validate duration parameter + if (action === Action.HOLD_KEY || action === Action.WAIT) { + this.validateDuration(duration); + } + } +} \ No newline at end of file diff --git 
a/pkg/templates/typescript/ehr-system/tsconfig.json b/pkg/templates/typescript/ehr-system/tsconfig.json new file mode 100644 index 0000000..fa10973 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../tsconfig.base.json", + "compilerOptions": { + "outDir": "dist", + "rootDir": ".", + "lib": ["ESNext", "DOM"] + }, + "include": ["."] +} diff --git a/pkg/templates/typescript/ehr-system/types/beta.ts b/pkg/templates/typescript/ehr-system/types/beta.ts new file mode 100644 index 0000000..35328d7 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/types/beta.ts @@ -0,0 +1,58 @@ +import type { BetaMessageParam as AnthropicMessageParam, BetaMessage as AnthropicMessage, BetaContentBlock as AnthropicContentBlock } from '@anthropic-ai/sdk/resources/beta/messages/messages'; +import type { ActionParams } from '../tools/types/computer'; + +// Re-export the SDK types +export type BetaMessageParam = AnthropicMessageParam; +export type BetaMessage = AnthropicMessage; +export type BetaContentBlock = AnthropicContentBlock; + +// Keep our local types for internal use +export interface BetaTextBlock { + type: 'text'; + text: string; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaImageBlock { + type: 'image'; + source: { + type: 'base64'; + media_type: 'image/png'; + data: string; + }; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaToolUseBlock { + type: 'tool_use'; + name: string; + input: ActionParams; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaThinkingBlock { + type: 'thinking'; + thinking: { + type: 'enabled'; + budget_tokens: number; + } | { + type: 'disabled'; + }; + signature?: string; + id?: string; + cache_control?: { type: 'ephemeral' }; +} + +export interface BetaToolResultBlock { + type: 'tool_result'; + content: (BetaTextBlock | BetaImageBlock)[] | string; + tool_use_id: string; + is_error: boolean; + id?: 
string; + cache_control?: { type: 'ephemeral' }; +} + +export type BetaLocalContentBlock = BetaTextBlock | BetaImageBlock | BetaToolUseBlock | BetaThinkingBlock | BetaToolResultBlock; \ No newline at end of file diff --git a/pkg/templates/typescript/ehr-system/utils/message-processing.ts b/pkg/templates/typescript/ehr-system/utils/message-processing.ts new file mode 100644 index 0000000..2595ec4 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/utils/message-processing.ts @@ -0,0 +1,79 @@ +import type { BetaMessage, BetaMessageParam, BetaToolResultBlock, BetaContentBlock, BetaLocalContentBlock } from '../types/beta'; + +export function responseToParams(response: BetaMessage): BetaContentBlock[] { + return response.content.map(block => { + if (block.type === 'text' && block.text) { + return { type: 'text', text: block.text } as BetaContentBlock; + } + if (block.type === 'thinking') { + const { thinking, signature, ...rest } = block as any; + return { ...rest, thinking, ...(signature && { signature }) } as BetaContentBlock; + } + return block as BetaContentBlock; + }); +} + +export function maybeFilterToNMostRecentImages( + messages: BetaMessageParam[], + imagesToKeep: number, + minRemovalThreshold: number +): void { + if (!imagesToKeep) return; + + const toolResultBlocks = messages + .flatMap(message => Array.isArray(message?.content) ? 
message.content : []) + .filter((item): item is BetaToolResultBlock => + typeof item === 'object' && item.type === 'tool_result' + ); + + const totalImages = toolResultBlocks.reduce((count, toolResult) => { + if (!Array.isArray(toolResult.content)) return count; + return count + toolResult.content.filter( + content => typeof content === 'object' && content.type === 'image' + ).length; + }, 0); + + let imagesToRemove = Math.floor((totalImages - imagesToKeep) / minRemovalThreshold) * minRemovalThreshold; + + for (const toolResult of toolResultBlocks) { + if (Array.isArray(toolResult.content)) { + toolResult.content = toolResult.content.filter(content => { + if (typeof content === 'object' && content.type === 'image') { + if (imagesToRemove > 0) { + imagesToRemove--; + return false; + } + } + return true; + }); + } + } +} + +const PROMPT_CACHING_BETA_FLAG = 'prompt-caching-2024-07-31'; + +export function injectPromptCaching(messages: BetaMessageParam[]): void { + let breakpointsRemaining = 3; + + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i]; + if (!message) continue; + if (message.role === 'user' && Array.isArray(message.content)) { + if (breakpointsRemaining > 0) { + breakpointsRemaining--; + const lastContent = message.content[message.content.length - 1]; + if (lastContent) { + (lastContent as BetaLocalContentBlock).cache_control = { type: 'ephemeral' }; + } + } else { + const lastContent = message.content[message.content.length - 1]; + if (lastContent) { + delete (lastContent as BetaLocalContentBlock).cache_control; + } + break; + } + } + } +} + +export { PROMPT_CACHING_BETA_FLAG }; \ No newline at end of file diff --git a/pkg/templates/typescript/ehr-system/utils/tool-results.ts b/pkg/templates/typescript/ehr-system/utils/tool-results.ts new file mode 100644 index 0000000..c18eab2 --- /dev/null +++ b/pkg/templates/typescript/ehr-system/utils/tool-results.ts @@ -0,0 +1,49 @@ +import type { ToolResult } from 
'../tools/types/computer'; +import type { BetaToolResultBlock, BetaTextBlock, BetaImageBlock } from '../types/beta'; + +export function makeApiToolResult( + result: ToolResult, + toolUseId: string +): BetaToolResultBlock { + const toolResultContent: (BetaTextBlock | BetaImageBlock)[] = []; + let isError = false; + + if (result.error) { + isError = true; + toolResultContent.push({ + type: 'text', + text: maybePrependSystemToolResult(result, result.error), + }); + } else { + if (result.output) { + toolResultContent.push({ + type: 'text', + text: maybePrependSystemToolResult(result, result.output), + }); + } + if (result.base64Image) { + toolResultContent.push({ + type: 'image', + source: { + type: 'base64', + media_type: 'image/png', + data: result.base64Image, + }, + }); + } + } + + return { + type: 'tool_result', + content: toolResultContent, + tool_use_id: toolUseId, + is_error: isError, + }; +} + +export function maybePrependSystemToolResult(result: ToolResult, resultText: string): string { + if (result.system) { + return `${result.system}\n${resultText}`; + } + return resultText; +} \ No newline at end of file