From 7ba546f5ce85579b04f4d5a6a7c0fb7924fe0956 Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Thu, 12 Mar 2026 11:39:46 -0400 Subject: [PATCH 1/4] add --query flag for query format support --- .agents/skills/firecrawl-scrape/SKILL.md | 67 ++++++++++++++++++++++++ src/commands/scrape.ts | 13 ++++- src/index.ts | 4 ++ src/types/scrape.ts | 2 + src/utils/options.ts | 1 + 5 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 .agents/skills/firecrawl-scrape/SKILL.md diff --git a/.agents/skills/firecrawl-scrape/SKILL.md b/.agents/skills/firecrawl-scrape/SKILL.md new file mode 100644 index 0000000..4a33683 --- /dev/null +++ b/.agents/skills/firecrawl-scrape/SKILL.md @@ -0,0 +1,67 @@ +--- +name: firecrawl-scrape +description: | + Extract clean markdown from any URL, including JavaScript-rendered SPAs. Use this skill whenever the user provides a URL and wants its content, says "scrape", "grab", "fetch", "pull", "get the page", "extract from this URL", or "read this webpage". Handles JS-rendered pages, multiple concurrent URLs, and returns LLM-optimized markdown. Use this instead of WebFetch for any webpage content extraction. +allowed-tools: + - Bash(firecrawl *) + - Bash(npx firecrawl *) +--- + +# firecrawl scrape + +Scrape one or more URLs. Returns clean, LLM-optimized markdown. Multiple URLs are scraped concurrently. 
+ +## When to use + +- You have a specific URL and want its content +- The page is static or JS-rendered (SPA) +- Step 2 in the [workflow escalation pattern](firecrawl-cli): search → **scrape** → map → crawl → browser + +## Quick start + +```bash +# Basic markdown extraction +firecrawl scrape "" -o .firecrawl/page.md + +# Main content only, no nav/footer +firecrawl scrape "" --only-main-content -o .firecrawl/page.md + +# Wait for JS to render, then scrape +firecrawl scrape "" --wait-for 3000 -o .firecrawl/page.md + +# Multiple URLs (each saved to .firecrawl/) +firecrawl scrape https://example.com https://example.com/blog https://example.com/docs + +# Get markdown and links together +firecrawl scrape "" --format markdown,links -o .firecrawl/page.json + +# Ask a question about the page +firecrawl scrape "https://example.com/pricing" --query "What is the enterprise plan price?" +``` + +## Options + +| Option | Description | +| ------------------------ | ---------------------------------------------------------------- | +| `-f, --format ` | Output formats: markdown, html, rawHtml, links, screenshot, json | +| `-Q, --query ` | Ask a question about the page content (5 credits) | +| `-H` | Include HTTP headers in output | +| `--only-main-content` | Strip nav, footer, sidebar — main content only | +| `--wait-for ` | Wait for JS rendering before scraping | +| `--include-tags ` | Only include these HTML tags | +| `--exclude-tags ` | Exclude these HTML tags | +| `-o, --output ` | Output file path | + +## Tips + +- **Try scrape before browser.** Scrape handles static pages and JS-rendered SPAs. Only escalate to browser when you need interaction (clicks, form fills, pagination). +- Multiple URLs are scraped concurrently — check `firecrawl --status` for your concurrency limit. +- Single format outputs raw content. Multiple formats (e.g., `--format markdown,links`) output JSON. +- Always quote URLs — shell interprets `?` and `&` as special characters. 
+- Naming convention: `.firecrawl/{site}-{path}.md` + +## See also + +- [firecrawl-search](../firecrawl-search/SKILL.md) — find pages when you don't have a URL +- [firecrawl-browser](../firecrawl-browser/SKILL.md) — when scrape can't get the content (interaction needed) +- [firecrawl-download](../firecrawl-download/SKILL.md) — bulk download an entire site to local files diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 96b652b..7f68b5e 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -10,7 +10,7 @@ import type { ScrapeLocation, } from '../types/scrape'; import { getClient } from '../utils/client'; -import { handleScrapeOutput } from '../utils/output'; +import { handleScrapeOutput, writeOutput } from '../utils/output'; import { getOrigin } from '../utils/url'; import { executeMap } from './map'; import { getStatus } from './status'; @@ -71,6 +71,11 @@ export async function executeScrape( formats.push('screenshot'); } + // Inject query format if --query was provided + if (options.query) { + formats.push({ type: 'query', prompt: options.query } as any); + } + // If no formats specified, default to markdown if (formats.length === 0) { formats.push('markdown'); @@ -136,6 +141,12 @@ export async function handleScrapeCommand( ): Promise { const result = await executeScrape(options); + // Query mode: output answer directly + if (options.query && result.success && result.data?.answer) { + writeOutput(result.data.answer, options.output, !!options.output); + return; + } + // Determine effective formats for output handling const effectiveFormats: ScrapeFormat[] = options.formats && options.formats.length > 0 diff --git a/src/index.ts b/src/index.ts index 2ceb242..072aa29 100644 --- a/src/index.ts +++ b/src/index.ts @@ -158,6 +158,10 @@ function createScrapeCommand(): Command { '--languages ', 'Comma-separated language codes for scraping (e.g., en,es)' ) + .option( + '-Q, --query ', + 'Ask a question about the page content (query format)' + ) 
.action(async (positionalArgs, options) => { // Collect URLs from positional args and --url option diff --git a/src/types/scrape.ts b/src/types/scrape.ts index eaa553d..345b797 100644 --- a/src/types/scrape.ts +++ b/src/types/scrape.ts @@ -55,6 +55,8 @@ export interface ScrapeOptions { maxAge?: number; /** Location settings for geo-targeted scraping */ location?: ScrapeLocation; + /** Question to ask about the page content (query format) */ + query?: string; } export interface ScrapeResult { diff --git a/src/utils/options.ts b/src/utils/options.ts index 290c665..52a3a95 100644 --- a/src/utils/options.ts +++ b/src/utils/options.ts @@ -111,5 +111,6 @@ export function parseScrapeOptions(options: any): ScrapeOptions { timing: options.timing, maxAge: options.maxAge, location, + query: options.query, }; } From 549efd3d763297b5a49a458812d1829ce1f92c4c Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Thu, 12 Mar 2026 11:44:33 -0400 Subject: [PATCH 2/4] bump version to 1.10.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 0d803b9..ae708d7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.9.8", + "version": "1.10.0", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { From 7becea6348a76c5530541a46c1ea11a0963851ff Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Thu, 12 Mar 2026 12:13:32 -0400 Subject: [PATCH 3/4] remove firecrawl agent command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the agent product feature (startAgent/getAgentStatus) is being cut from the CLI. core primitives (scrape, search, crawl, map, browser) stay. 
coding agent detection (detectCodingAgents) is unrelated and stays. removes: command, types, skill, README section, mock interface member. also removes stale skills/ index entries that were replaced by symlinks. bumps version 1.10.0 → 1.11.0. --- README.md | 68 ----- package.json | 2 +- skills/firecrawl-agent/SKILL.md | 57 ---- skills/firecrawl-browser/SKILL.md | 107 ------- skills/firecrawl-crawl/SKILL.md | 58 ---- skills/firecrawl-download/SKILL.md | 69 ----- skills/firecrawl-map/SKILL.md | 50 ---- skills/firecrawl-scrape/SKILL.md | 63 ----- skills/firecrawl-search/SKILL.md | 59 ---- src/__tests__/utils/mock-client.ts | 1 - src/commands/agent.ts | 433 ----------------------------- src/index.ts | 114 +------- src/types/agent.ts | 61 ---- 13 files changed, 2 insertions(+), 1140 deletions(-) delete mode 100644 skills/firecrawl-agent/SKILL.md delete mode 100644 skills/firecrawl-browser/SKILL.md delete mode 100644 skills/firecrawl-crawl/SKILL.md delete mode 100644 skills/firecrawl-download/SKILL.md delete mode 100644 skills/firecrawl-map/SKILL.md delete mode 100644 skills/firecrawl-scrape/SKILL.md delete mode 100644 skills/firecrawl-search/SKILL.md delete mode 100644 src/commands/agent.ts delete mode 100644 src/types/agent.ts diff --git a/README.md b/README.md index c920367..76e2bfb 100644 --- a/README.md +++ b/README.md @@ -397,74 +397,6 @@ firecrawl credit-usage --json --pretty --- -### `agent` - AI-powered web data extraction - -Run an AI agent that autonomously browses and extracts structured data from the web based on natural language prompts. - -> **Note:** Agent tasks typically take **2 to 5 minutes** to complete, and sometimes longer for complex extractions. Use sparingly and consider `--max-credits` to limit costs. 
- -```bash -# Basic usage (returns job ID immediately) -firecrawl agent "Find the pricing plans for Firecrawl" - -# Wait for completion -firecrawl agent "Extract all product names and prices from this store" --wait - -# Focus on specific URLs -firecrawl agent "Get the main features listed" --urls https://example.com/features - -# Use structured output with JSON schema -firecrawl agent "Extract company info" --schema '{"type":"object","properties":{"name":{"type":"string"},"employees":{"type":"number"}}}' - -# Load schema from file -firecrawl agent "Extract product data" --schema-file ./product-schema.json --wait - -# Check status of an existing job -firecrawl agent -firecrawl agent --wait -``` - -#### Agent Options - -| Option | Description | -| --------------------------- | ------------------------------------------------------------- | -| `--urls ` | Comma-separated URLs to focus extraction on | -| `--model ` | `spark-1-mini` (default, cheaper) or `spark-1-pro` (accurate) | -| `--schema ` | JSON schema for structured output (inline JSON string) | -| `--schema-file ` | Path to JSON schema file for structured output | -| `--max-credits ` | Maximum credits to spend (job fails if exceeded) | -| `--status` | Check status of existing agent job | -| `--wait` | Wait for agent to complete before returning results | -| `--poll-interval ` | Polling interval in seconds when waiting (default: 5) | -| `--timeout ` | Timeout in seconds when waiting (default: no timeout) | -| `-o, --output ` | Save output to file | -| `--json` | Output as JSON format | -| `--pretty` | Pretty print JSON output | - -#### Examples - -```bash -# Research task with timeout -firecrawl agent "Find the top 5 competitors of Notion and their pricing" --wait --timeout 300 - -# Extract data with cost limit -firecrawl agent "Get all blog post titles and dates" --urls https://blog.example.com --max-credits 100 --wait - -# Use higher accuracy model for complex extraction -firecrawl agent "Extract detailed 
technical specifications" --model spark-1-pro --wait --pretty - -# Save structured results to file -firecrawl agent "Extract contact information" --schema-file ./contact-schema.json --wait -o contacts.json --pretty - -# Check job status without waiting -firecrawl agent abc123-def456-... --json - -# Poll a running job until completion -firecrawl agent abc123-def456-... --wait --poll-interval 10 -``` - ---- - ### `browser` - Browser sandbox sessions (Beta) Launch and control cloud browser sessions. By default, commands are sent to agent-browser (pre-installed in every sandbox). Use `--python` or `--node` to run Playwright code directly instead. diff --git a/package.json b/package.json index ae708d7..8e5ed39 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.10.0", + "version": "1.11.0", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { diff --git a/skills/firecrawl-agent/SKILL.md b/skills/firecrawl-agent/SKILL.md deleted file mode 100644 index 0b3577f..0000000 --- a/skills/firecrawl-agent/SKILL.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -name: firecrawl-agent -description: | - AI-powered autonomous data extraction that navigates complex sites and returns structured JSON. Use this skill when the user wants structured data from websites, needs to extract pricing tiers, product listings, directory entries, or any data as JSON with a schema. Triggers on "extract structured data", "get all the products", "pull pricing info", "extract as JSON", or when the user provides a JSON schema for website data. More powerful than simple scraping for multi-page structured extraction. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl agent - -AI-powered autonomous extraction. The agent navigates sites and extracts structured data (takes 2-5 minutes). 
- -## When to use - -- You need structured data from complex multi-page sites -- Manual scraping would require navigating many pages -- You want the AI to figure out where the data lives - -## Quick start - -```bash -# Extract structured data -firecrawl agent "extract all pricing tiers" --wait -o .firecrawl/pricing.json - -# With a JSON schema for structured output -firecrawl agent "extract products" --schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}' --wait -o .firecrawl/products.json - -# Focus on specific pages -firecrawl agent "get feature list" --urls "" --wait -o .firecrawl/features.json -``` - -## Options - -| Option | Description | -| ---------------------- | ----------------------------------------- | -| `--urls ` | Starting URLs for the agent | -| `--model ` | Model to use: spark-1-mini or spark-1-pro | -| `--schema ` | JSON schema for structured output | -| `--schema-file ` | Path to JSON schema file | -| `--max-credits ` | Credit limit for this agent run | -| `--wait` | Wait for agent to complete | -| `--pretty` | Pretty print JSON output | -| `-o, --output ` | Output file path | - -## Tips - -- Always use `--wait` to get results inline. Without it, returns a job ID. -- Use `--schema` for predictable, structured output — otherwise the agent returns freeform data. -- Agent runs consume more credits than simple scrapes. Use `--max-credits` to cap spending. -- For simple single-page extraction, prefer `scrape` — it's faster and cheaper. 
- -## See also - -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — simpler single-page extraction -- [firecrawl-browser](../firecrawl-browser/SKILL.md) — manual browser automation (more control) -- [firecrawl-crawl](../firecrawl-crawl/SKILL.md) — bulk extraction without AI diff --git a/skills/firecrawl-browser/SKILL.md b/skills/firecrawl-browser/SKILL.md deleted file mode 100644 index ed0f0ae..0000000 --- a/skills/firecrawl-browser/SKILL.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -name: firecrawl-browser -description: | - Cloud browser automation for pages requiring interaction — clicks, form fills, login, pagination, infinite scroll. Use this skill when the user needs to interact with a webpage, log into a site, click buttons, fill forms, navigate multi-step flows, handle pagination, or when regular scraping fails because content requires JavaScript interaction. Triggers on "click", "fill out the form", "log in to", "paginated", "infinite scroll", "interact with the page", or "scrape failed". Provides remote Chromium sessions with persistent profiles. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl browser - -Cloud Chromium sessions in Firecrawl's remote sandboxed environment. Interact with pages that require clicks, form fills, pagination, or login. 
- -## When to use - -- Content requires interaction: clicks, form fills, pagination, login -- `scrape` failed because content is behind JavaScript interaction -- You need to navigate a multi-step flow -- Last resort in the [workflow escalation pattern](firecrawl-cli): search → scrape → map → crawl → **browser** -- **Never use browser for web searches** — use `search` instead - -## Quick start - -```bash -# Typical browser workflow -firecrawl browser "open " -firecrawl browser "snapshot -i" # see interactive elements with @ref IDs -firecrawl browser "click @e5" # interact with elements -firecrawl browser "fill @e3 'search query'" # fill form fields -firecrawl browser "scrape" -o .firecrawl/page.md # extract content -firecrawl browser close -``` - -Shorthand auto-launches a session if none exists — no setup required. - -## Commands - -| Command | Description | -| -------------------- | ---------------------------------------- | -| `open ` | Navigate to a URL | -| `snapshot -i` | Get interactive elements with `@ref` IDs | -| `screenshot` | Capture a PNG screenshot | -| `click <@ref>` | Click an element by ref | -| `type <@ref> ` | Type into an element | -| `fill <@ref> ` | Fill a form field (clears first) | -| `scrape` | Extract page content as markdown | -| `scroll ` | Scroll up/down/left/right | -| `wait ` | Wait for a duration | -| `eval ` | Evaluate JavaScript on the page | - -Session management: `launch-session --ttl 600`, `list`, `close` - -## Options - -| Option | Description | -| ---------------------------- | -------------------------------------------------- | -| `--ttl ` | Session time-to-live | -| `--ttl-inactivity ` | Inactivity timeout | -| `--session ` | Use a specific session ID | -| `--profile ` | Use a named profile (persists state) | -| `--no-save-changes` | Read-only reconnect (don't write to session state) | -| `-o, --output ` | Output file path | - -## Profiles - -Profiles survive close and can be reconnected by name. 
Use them for login-then-work flows: - -```bash -# Session 1: Login and save state -firecrawl browser launch-session --profile my-app -firecrawl browser "open https://app.example.com/login" -firecrawl browser "snapshot -i" -firecrawl browser "fill @e3 'user@example.com'" -firecrawl browser "click @e7" -firecrawl browser "wait 2" -firecrawl browser close - -# Session 2: Come back authenticated -firecrawl browser launch-session --profile my-app -firecrawl browser "open https://app.example.com/dashboard" -firecrawl browser "scrape" -o .firecrawl/dashboard.md -firecrawl browser close -``` - -Read-only reconnect (no writes to session state): - -```bash -firecrawl browser launch-session --profile my-app --no-save-changes -``` - -Shorthand with profile: - -```bash -firecrawl browser --profile my-app "open https://example.com" -``` - -## Tips - -- If you get forbidden errors, the session may have expired — create a new one. -- For parallel browser work, launch separate sessions and operate them via `--session `. -- Always `close` sessions when done to free resources. - -## See also - -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — try scrape first, escalate to browser only when needed -- [firecrawl-search](../firecrawl-search/SKILL.md) — for web searches (never use browser for searching) -- [firecrawl-agent](../firecrawl-agent/SKILL.md) — AI-powered extraction (less manual control) diff --git a/skills/firecrawl-crawl/SKILL.md b/skills/firecrawl-crawl/SKILL.md deleted file mode 100644 index fb2f3bd..0000000 --- a/skills/firecrawl-crawl/SKILL.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -name: firecrawl-crawl -description: | - Bulk extract content from an entire website or site section. Use this skill when the user wants to crawl a site, extract all pages from a docs section, bulk-scrape multiple pages following links, or says "crawl", "get all the pages", "extract everything under /docs", "bulk extract", or needs content from many pages on the same site. 
Handles depth limits, path filtering, and concurrent extraction. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl crawl - -Bulk extract content from a website. Crawls pages following links up to a depth/limit. - -## When to use - -- You need content from many pages on a site (e.g., all `/docs/`) -- You want to extract an entire site section -- Step 4 in the [workflow escalation pattern](firecrawl-cli): search → scrape → map → **crawl** → browser - -## Quick start - -```bash -# Crawl a docs section -firecrawl crawl "" --include-paths /docs --limit 50 --wait -o .firecrawl/crawl.json - -# Full crawl with depth limit -firecrawl crawl "" --max-depth 3 --wait --progress -o .firecrawl/crawl.json - -# Check status of a running crawl -firecrawl crawl -``` - -## Options - -| Option | Description | -| ------------------------- | ------------------------------------------- | -| `--wait` | Wait for crawl to complete before returning | -| `--progress` | Show progress while waiting | -| `--limit ` | Max pages to crawl | -| `--max-depth ` | Max link depth to follow | -| `--include-paths ` | Only crawl URLs matching these paths | -| `--exclude-paths ` | Skip URLs matching these paths | -| `--delay ` | Delay between requests | -| `--max-concurrency ` | Max parallel crawl workers | -| `--pretty` | Pretty print JSON output | -| `-o, --output ` | Output file path | - -## Tips - -- Always use `--wait` when you need the results immediately. Without it, crawl returns a job ID for async polling. -- Use `--include-paths` to scope the crawl — don't crawl an entire site when you only need one section. -- Crawl consumes credits per page. Check `firecrawl credit-usage` before large crawls. 
- -## See also - -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — scrape individual pages -- [firecrawl-map](../firecrawl-map/SKILL.md) — discover URLs before deciding to crawl -- [firecrawl-download](../firecrawl-download/SKILL.md) — download site to local files (uses map + scrape) diff --git a/skills/firecrawl-download/SKILL.md b/skills/firecrawl-download/SKILL.md deleted file mode 100644 index d2beeb7..0000000 --- a/skills/firecrawl-download/SKILL.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -name: firecrawl-download -description: | - Download an entire website as local files — markdown, screenshots, or multiple formats per page. Use this skill when the user wants to save a site locally, download documentation for offline use, bulk-save pages as files, or says "download the site", "save as local files", "offline copy", "download all the docs", or "save for reference". Combines site mapping and scraping into organized local directories. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl download - -> **Experimental.** Convenience command that combines `map` + `scrape` to save an entire site as local files. - -Maps the site first to discover pages, then scrapes each one into nested directories under `.firecrawl/`. All scrape options work with download. Always pass `-y` to skip the confirmation prompt. 
- -## When to use - -- You want to save an entire site (or section) to local files -- You need offline access to documentation or content -- Bulk content extraction with organized file structure - -## Quick start - -```bash -# Interactive wizard (picks format, screenshots, paths for you) -firecrawl download https://docs.example.com - -# With screenshots -firecrawl download https://docs.example.com --screenshot --limit 20 -y - -# Multiple formats (each saved as its own file per page) -firecrawl download https://docs.example.com --format markdown,links --screenshot --limit 20 -y -# Creates per page: index.md + links.txt + screenshot.png - -# Filter to specific sections -firecrawl download https://docs.example.com --include-paths "/features,/sdks" - -# Skip translations -firecrawl download https://docs.example.com --exclude-paths "/zh,/ja,/fr,/es,/pt-BR" - -# Full combo -firecrawl download https://docs.example.com \ - --include-paths "/features,/sdks" \ - --exclude-paths "/zh,/ja" \ - --only-main-content \ - --screenshot \ - -y -``` - -## Download options - -| Option | Description | -| ------------------------- | -------------------------------------------------------- | -| `--limit ` | Max pages to download | -| `--search ` | Filter URLs by search query | -| `--include-paths ` | Only download matching paths | -| `--exclude-paths ` | Skip matching paths | -| `--allow-subdomains` | Include subdomain pages | -| `-y` | Skip confirmation prompt (always use in automated flows) | - -## Scrape options (all work with download) - -`-f `, `-H`, `-S`, `--screenshot`, `--full-page-screenshot`, `--only-main-content`, `--include-tags`, `--exclude-tags`, `--wait-for`, `--max-age`, `--country`, `--languages` - -## See also - -- [firecrawl-map](../firecrawl-map/SKILL.md) — just discover URLs without downloading -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — scrape individual pages -- [firecrawl-crawl](../firecrawl-crawl/SKILL.md) — bulk extract as JSON (not local files) diff 
--git a/skills/firecrawl-map/SKILL.md b/skills/firecrawl-map/SKILL.md deleted file mode 100644 index f8047fc..0000000 --- a/skills/firecrawl-map/SKILL.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -name: firecrawl-map -description: | - Discover and list all URLs on a website, with optional search filtering. Use this skill when the user wants to find a specific page on a large site, list all URLs, see the site structure, find where something is on a domain, or says "map the site", "find the URL for", "what pages are on", or "list all pages". Essential when the user knows which site but not which exact page. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl map - -Discover URLs on a site. Use `--search` to find a specific page within a large site. - -## When to use - -- You need to find a specific subpage on a large site -- You want a list of all URLs on a site before scraping or crawling -- Step 3 in the [workflow escalation pattern](firecrawl-cli): search → scrape → **map** → crawl → browser - -## Quick start - -```bash -# Find a specific page on a large site -firecrawl map "" --search "authentication" -o .firecrawl/filtered.txt - -# Get all URLs -firecrawl map "" --limit 500 --json -o .firecrawl/urls.json -``` - -## Options - -| Option | Description | -| --------------------------------- | ---------------------------- | -| `--limit ` | Max number of URLs to return | -| `--search ` | Filter URLs by search query | -| `--sitemap ` | Sitemap handling strategy | -| `--include-subdomains` | Include subdomain URLs | -| `--json` | Output as JSON | -| `-o, --output ` | Output file path | - -## Tips - -- **Map + scrape is a common pattern**: use `map --search` to find the right URL, then `scrape` it. -- Example: `map https://docs.example.com --search "auth"` → found `/docs/api/authentication` → `scrape` that URL. 
- -## See also - -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — scrape the URLs you discover -- [firecrawl-crawl](../firecrawl-crawl/SKILL.md) — bulk extract instead of map + scrape -- [firecrawl-download](../firecrawl-download/SKILL.md) — download entire site (uses map internally) diff --git a/skills/firecrawl-scrape/SKILL.md b/skills/firecrawl-scrape/SKILL.md deleted file mode 100644 index a090f48..0000000 --- a/skills/firecrawl-scrape/SKILL.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -name: firecrawl-scrape -description: | - Extract clean markdown from any URL, including JavaScript-rendered SPAs. Use this skill whenever the user provides a URL and wants its content, says "scrape", "grab", "fetch", "pull", "get the page", "extract from this URL", or "read this webpage". Handles JS-rendered pages, multiple concurrent URLs, and returns LLM-optimized markdown. Use this instead of WebFetch for any webpage content extraction. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl scrape - -Scrape one or more URLs. Returns clean, LLM-optimized markdown. Multiple URLs are scraped concurrently. 
- -## When to use - -- You have a specific URL and want its content -- The page is static or JS-rendered (SPA) -- Step 2 in the [workflow escalation pattern](firecrawl-cli): search → **scrape** → map → crawl → browser - -## Quick start - -```bash -# Basic markdown extraction -firecrawl scrape "" -o .firecrawl/page.md - -# Main content only, no nav/footer -firecrawl scrape "" --only-main-content -o .firecrawl/page.md - -# Wait for JS to render, then scrape -firecrawl scrape "" --wait-for 3000 -o .firecrawl/page.md - -# Multiple URLs (each saved to .firecrawl/) -firecrawl scrape https://example.com https://example.com/blog https://example.com/docs - -# Get markdown and links together -firecrawl scrape "" --format markdown,links -o .firecrawl/page.json -``` - -## Options - -| Option | Description | -| ------------------------ | ---------------------------------------------------------------- | -| `-f, --format ` | Output formats: markdown, html, rawHtml, links, screenshot, json | -| `-H` | Include HTTP headers in output | -| `--only-main-content` | Strip nav, footer, sidebar — main content only | -| `--wait-for ` | Wait for JS rendering before scraping | -| `--include-tags ` | Only include these HTML tags | -| `--exclude-tags ` | Exclude these HTML tags | -| `-o, --output ` | Output file path | - -## Tips - -- **Try scrape before browser.** Scrape handles static pages and JS-rendered SPAs. Only escalate to browser when you need interaction (clicks, form fills, pagination). -- Multiple URLs are scraped concurrently — check `firecrawl --status` for your concurrency limit. -- Single format outputs raw content. Multiple formats (e.g., `--format markdown,links`) output JSON. -- Always quote URLs — shell interprets `?` and `&` as special characters. 
-- Naming convention: `.firecrawl/{site}-{path}.md` - -## See also - -- [firecrawl-search](../firecrawl-search/SKILL.md) — find pages when you don't have a URL -- [firecrawl-browser](../firecrawl-browser/SKILL.md) — when scrape can't get the content (interaction needed) -- [firecrawl-download](../firecrawl-download/SKILL.md) — bulk download an entire site to local files diff --git a/skills/firecrawl-search/SKILL.md b/skills/firecrawl-search/SKILL.md deleted file mode 100644 index aec4232..0000000 --- a/skills/firecrawl-search/SKILL.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -name: firecrawl-search -description: | - Web search with full page content extraction. Use this skill whenever the user asks to search the web, find articles, research a topic, look something up, find recent news, discover sources, or says "search for", "find me", "look up", "what are people saying about", or "find articles about". Returns real search results with optional full-page markdown — not just snippets. Provides capabilities beyond Claude's built-in WebSearch. -allowed-tools: - - Bash(firecrawl *) - - Bash(npx firecrawl *) ---- - -# firecrawl search - -Web search with optional content scraping. Returns search results as JSON, optionally with full page content. 
- -## When to use - -- You don't have a specific URL yet -- You need to find pages, answer questions, or discover sources -- First step in the [workflow escalation pattern](firecrawl-cli): search → scrape → map → crawl → browser - -## Quick start - -```bash -# Basic search -firecrawl search "your query" -o .firecrawl/result.json --json - -# Search and scrape full page content from results -firecrawl search "your query" --scrape -o .firecrawl/scraped.json --json - -# News from the past day -firecrawl search "your query" --sources news --tbs qdr:d -o .firecrawl/news.json --json -``` - -## Options - -| Option | Description | -| ------------------------------------ | --------------------------------------------- | -| `--limit ` | Max number of results | -| `--sources ` | Source types to search | -| `--categories ` | Filter by category | -| `--tbs ` | Time-based search filter | -| `--location` | Location for search results | -| `--country ` | Country code for search | -| `--scrape` | Also scrape full page content for each result | -| `--scrape-formats` | Formats when scraping (default: markdown) | -| `-o, --output ` | Output file path | -| `--json` | Output as JSON | - -## Tips - -- **`--scrape` fetches full content** — don't re-scrape URLs from search results. This saves credits and avoids redundant fetches. -- Always write results to `.firecrawl/` with `-o` to avoid context window bloat. 
-- Use `jq` to extract URLs or titles: `jq -r '.data.web[].url' .firecrawl/search.json` -- Naming convention: `.firecrawl/search-{query}.json` or `.firecrawl/search-{query}-scraped.json` - -## See also - -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — scrape a specific URL -- [firecrawl-map](../firecrawl-map/SKILL.md) — discover URLs within a site -- [firecrawl-crawl](../firecrawl-crawl/SKILL.md) — bulk extract from a site diff --git a/src/__tests__/utils/mock-client.ts b/src/__tests__/utils/mock-client.ts index 6165d6e..96c8bee 100644 --- a/src/__tests__/utils/mock-client.ts +++ b/src/__tests__/utils/mock-client.ts @@ -14,7 +14,6 @@ export interface MockFirecrawlClient { crawl?: any; map?: any; extract?: any; - agent?: any; } /** diff --git a/src/commands/agent.ts b/src/commands/agent.ts deleted file mode 100644 index 65d8341..0000000 --- a/src/commands/agent.ts +++ /dev/null @@ -1,433 +0,0 @@ -/** - * Agent command implementation - */ - -import type { - AgentOptions, - AgentResult, - AgentStatusResult, -} from '../types/agent'; -import { getClient } from '../utils/client'; -import { isJobId } from '../utils/job'; -import { writeOutput } from '../utils/output'; -import { createSpinner } from '../utils/spinner'; -import { readFileSync } from 'fs'; - -/** - * Extract detailed error message from API errors - */ -function extractErrorMessage(error: unknown): string { - if (error instanceof Error) { - const anyError = error as any; - - // Handle Firecrawl SDK errors with details array - if (anyError.details && Array.isArray(anyError.details)) { - const messages = anyError.details - .map((d: any) => d.message || JSON.stringify(d)) - .join('; '); - return messages || error.message; - } - - // Check for response data in the error (common in axios/fetch errors) - if (anyError.response?.data?.error) { - return anyError.response.data.error; - } - if (anyError.response?.data?.message) { - return anyError.response.data.message; - } - if (anyError.response?.data) { - 
return JSON.stringify(anyError.response.data); - } - - return error.message; - } - return 'Unknown error occurred'; -} - -/** - * Load schema from file - */ -function loadSchemaFromFile(filePath: string): Record { - try { - const content = readFileSync(filePath, 'utf-8'); - return JSON.parse(content); - } catch (error) { - if ((error as NodeJS.ErrnoException).code === 'ENOENT') { - throw new Error(`Schema file not found: ${filePath}`); - } - if (error instanceof SyntaxError) { - throw new Error(`Invalid JSON in schema file: ${filePath}`); - } - throw error; - } -} - -/** - * Execute agent status check (with optional wait/polling) - */ -async function checkAgentStatus( - jobId: string, - options: AgentOptions -): Promise { - const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); - - // If not waiting, just return current status - if (!options.wait) { - try { - const status = await app.getAgentStatus(jobId); - return { - success: status.success, - data: { - id: jobId, - status: status.status, - data: status.data, - creditsUsed: status.creditsUsed, - expiresAt: status.expiresAt, - }, - }; - } catch (error) { - return { - success: false, - error: extractErrorMessage(error), - }; - } - } - - // Wait mode: poll until completion - const spinner = createSpinner(`Checking agent status...`); - spinner.start(); - - // Handle Ctrl+C gracefully - const handleInterrupt = () => { - spinner.stop(); - process.stderr.write('\n\nInterrupted. Agent may still be running.\n'); - process.stderr.write(`Check status with: firecrawl agent ${jobId}\n\n`); - process.exit(0); - }; - process.on('SIGINT', handleInterrupt); - - const pollMs = options.pollInterval ? options.pollInterval * 1000 : 5000; - const startTime = Date.now(); - const timeoutMs = options.timeout ? options.timeout * 1000 : undefined; - - try { - // Check initial status - let agentStatus = await app.getAgentStatus(jobId); - spinner.update(`Agent ${agentStatus.status}... 
(Job ID: ${jobId})`); - - while (true) { - if (agentStatus.status === 'completed') { - spinner.succeed('Agent completed'); - return { - success: agentStatus.success, - data: { - id: jobId, - status: agentStatus.status, - data: agentStatus.data, - creditsUsed: agentStatus.creditsUsed, - expiresAt: agentStatus.expiresAt, - }, - }; - } - - if (agentStatus.status === 'failed') { - spinner.fail('Agent failed'); - return { - success: false, - data: { - id: jobId, - status: agentStatus.status, - data: agentStatus.data, - creditsUsed: agentStatus.creditsUsed, - expiresAt: agentStatus.expiresAt, - }, - error: agentStatus.error, - }; - } - - // Check timeout - if (timeoutMs && Date.now() - startTime > timeoutMs) { - spinner.fail(`Timeout after ${options.timeout}s`); - return { - success: false, - error: `Timeout after ${options.timeout} seconds. Agent still processing.`, - }; - } - - await new Promise((resolve) => setTimeout(resolve, pollMs)); - agentStatus = await app.getAgentStatus(jobId); - spinner.update(`Agent ${agentStatus.status}... 
(Job ID: ${jobId})`); - } - } catch (error) { - spinner.fail('Failed to check agent status'); - return { - success: false, - error: extractErrorMessage(error), - }; - } finally { - process.removeListener('SIGINT', handleInterrupt); - } -} - -/** - * Execute agent command - */ -export async function executeAgent( - options: AgentOptions -): Promise { - try { - const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); - const { prompt, status, wait, pollInterval, timeout } = options; - - // If status flag is set or input looks like a job ID, check status - if (status || isJobId(prompt)) { - return await checkAgentStatus(prompt, options); - } - - // Load schema from file if specified - let schema: Record | undefined = options.schema as - | Record - | undefined; - if (options.schemaFile) { - schema = loadSchemaFromFile(options.schemaFile); - } - - // Build agent options - const agentParams: { - prompt: string; - urls?: string[]; - schema?: Record; - model?: 'spark-1-pro' | 'spark-1-mini'; - maxCredits?: number; - pollInterval?: number; - timeout?: number; - integration?: string; - } = { - prompt, - integration: 'cli', - }; - - if (options.urls && options.urls.length > 0) { - agentParams.urls = options.urls; - } - if (schema) { - agentParams.schema = schema; - } - if (options.model) { - agentParams.model = options.model as 'spark-1-pro' | 'spark-1-mini'; - } - if (options.maxCredits !== undefined) { - agentParams.maxCredits = options.maxCredits; - } - - // If wait mode, use polling with spinner - if (wait) { - const spinner = createSpinner('Starting agent...'); - spinner.start(); - - // Start agent first - let response; - try { - response = await app.startAgent(agentParams); - } catch (error) { - spinner.fail('Failed to start agent'); - return { - success: false, - error: extractErrorMessage(error), - }; - } - const jobId = response.id; - - // Handle Ctrl+C gracefully - const handleInterrupt = () => { - spinner.stop(); - 
process.stderr.write('\n\nInterrupted. Agent is still running.\n'); - process.stderr.write(`Check status with: firecrawl agent ${jobId}\n\n`); - process.exit(0); - }; - process.on('SIGINT', handleInterrupt); - - spinner.update(`Agent running... (Job ID: ${jobId})`); - - // Poll for status - const pollMs = pollInterval ? pollInterval * 1000 : 5000; - const startTime = Date.now(); - const timeoutMs = timeout ? timeout * 1000 : undefined; - - try { - while (true) { - await new Promise((resolve) => setTimeout(resolve, pollMs)); - - const agentStatus = await app.getAgentStatus(jobId); - - if (agentStatus.status === 'completed') { - process.removeListener('SIGINT', handleInterrupt); - spinner.succeed('Agent completed'); - return { - success: agentStatus.success, - data: { - id: jobId, - status: agentStatus.status, - data: agentStatus.data, - creditsUsed: agentStatus.creditsUsed, - expiresAt: agentStatus.expiresAt, - }, - }; - } - - if (agentStatus.status === 'failed') { - process.removeListener('SIGINT', handleInterrupt); - spinner.fail('Agent failed'); - return { - success: false, - data: { - id: jobId, - status: agentStatus.status, - data: agentStatus.data, - creditsUsed: agentStatus.creditsUsed, - expiresAt: agentStatus.expiresAt, - }, - error: agentStatus.error, - }; - } - - // Check timeout - if (timeoutMs && Date.now() - startTime > timeoutMs) { - process.removeListener('SIGINT', handleInterrupt); - spinner.fail(`Timeout after ${timeout}s (Job ID: ${jobId})`); - return { - success: false, - error: `Timeout after ${timeout} seconds. Agent still processing. 
Job ID: ${jobId}`, - }; - } - } - } finally { - process.removeListener('SIGINT', handleInterrupt); - } - } - - // Otherwise, start agent and return job ID - const spinner = createSpinner('Starting agent...'); - spinner.start(); - - let response; - try { - response = await app.startAgent(agentParams); - } catch (error) { - spinner.fail('Failed to start agent'); - return { - success: false, - error: extractErrorMessage(error), - }; - } - - spinner.succeed(`Agent started (Job ID: ${response.id})`); - - return { - success: response.success, - data: { - jobId: response.id, - status: 'processing', - }, - }; - } catch (error) { - return { - success: false, - error: extractErrorMessage(error), - }; - } -} - -/** - * Format agent status in human-readable way - */ -function formatAgentStatus(data: AgentStatusResult['data']): string { - if (!data) return ''; - - const lines: string[] = []; - lines.push(`Job ID: ${data.id}`); - lines.push(`Status: ${data.status}`); - - if (data.creditsUsed !== undefined) { - lines.push(`Credits Used: ${data.creditsUsed}`); - } - - if (data.expiresAt) { - const expiresDate = new Date(data.expiresAt); - lines.push( - `Expires: ${expiresDate.toLocaleString('en-US', { - year: 'numeric', - month: 'short', - day: 'numeric', - hour: '2-digit', - minute: '2-digit', - })}` - ); - } - - if (data.data) { - lines.push(''); - lines.push('Result:'); - lines.push(JSON.stringify(data.data, null, 2)); - } - - return lines.join('\n') + '\n'; -} - -/** - * Handle agent command output - */ -export async function handleAgentCommand(options: AgentOptions): Promise { - const result = await executeAgent(options); - - if (!result.success) { - console.error('Error:', result.error); - process.exit(1); - } - - // Handle status result (completed agent job with data) - if ('data' in result && result.data && 'data' in result.data) { - const statusResult = result as AgentStatusResult; - if (statusResult.data) { - let outputContent: string; - - if (options.json) { - // JSON 
format - outputContent = options.pretty - ? JSON.stringify({ success: true, ...statusResult.data }, null, 2) - : JSON.stringify({ success: true, ...statusResult.data }); - } else { - // Human-readable format - outputContent = formatAgentStatus(statusResult.data); - } - - writeOutput(outputContent, options.output, !!options.output); - return; - } - } - - // Handle agent start result (job ID) - const agentResult = result as AgentResult; - if (!agentResult.data) { - return; - } - - let outputContent: string; - - if ('jobId' in agentResult.data) { - const jobData = { - jobId: agentResult.data.jobId, - status: agentResult.data.status, - }; - - outputContent = options.pretty - ? JSON.stringify({ success: true, data: jobData }, null, 2) - : JSON.stringify({ success: true, data: jobData }); - } else { - outputContent = options.pretty - ? JSON.stringify(agentResult.data, null, 2) - : JSON.stringify(agentResult.data); - } - - writeOutput(outputContent, options.output, !!options.output); -} diff --git a/src/index.ts b/src/index.ts index 072aa29..e9fb747 100644 --- a/src/index.ts +++ b/src/index.ts @@ -17,7 +17,6 @@ import { handleCreditUsageCommand } from './commands/credit-usage'; import { handleCrawlCommand } from './commands/crawl'; import { handleMapCommand } from './commands/map'; import { handleSearchCommand } from './commands/search'; -import { handleAgentCommand } from './commands/agent'; import { handleBrowserLaunch, handleBrowserExecute, @@ -60,7 +59,6 @@ const AUTH_REQUIRED_COMMANDS = [ 'crawl', 'map', 'search', - 'agent', 'browser', 'credit-usage', ]; @@ -625,115 +623,6 @@ function createSearchCommand(): Command { return searchCmd; } -/** - * Create and configure the agent command - */ -function createAgentCommand(): Command { - const agentCmd = new Command('agent') - .description('Run an AI agent to extract data from the web') - .argument( - '', - 'Natural language prompt describing data to extract, or job ID to check status' - ) - .option('--urls ', 
'Comma-separated URLs to focus extraction on') - .option( - '--model ', - 'Model to use: spark-1-mini (default, cheaper) or spark-1-pro (higher accuracy)' - ) - .option( - '--schema ', - 'JSON schema for structured output (inline JSON string)' - ) - .option( - '--schema-file ', - 'Path to JSON schema file for structured output' - ) - .option( - '--max-credits ', - 'Maximum credits to spend (job fails if exceeded)', - parseInt - ) - .option('--status', 'Check status of existing agent job', false) - .option( - '--wait', - 'Wait for agent to complete before returning results', - false - ) - .option( - '--poll-interval ', - 'Polling interval in seconds when waiting (default: 5)', - parseFloat - ) - .option( - '--timeout ', - 'Timeout in seconds when waiting (default: no timeout)', - parseFloat - ) - .option( - '-k, --api-key ', - 'Firecrawl API key (overrides global --api-key)' - ) - .option('--api-url ', 'API URL (overrides global --api-url)') - .option('-o, --output ', 'Output file path (default: stdout)') - .option('--json', 'Output as JSON format', false) - .option('--pretty', 'Pretty print JSON output', false) - .action(async (promptOrJobId, options) => { - // Auto-detect if it's a job ID (UUID format) - const isStatusCheck = options.status || isJobId(promptOrJobId); - - // Parse URLs - let urls: string[] | undefined; - if (options.urls) { - urls = options.urls - .split(',') - .map((u: string) => u.trim()) - .filter((u: string) => u.length > 0); - } - - // Parse inline schema - let schema: Record | undefined; - if (options.schema) { - try { - schema = JSON.parse(options.schema) as Record; - } catch { - console.error('Error: Invalid JSON in --schema option'); - process.exit(1); - } - } - - // Validate model - const validModels = ['spark-1-pro', 'spark-1-mini']; - if (options.model && !validModels.includes(options.model)) { - console.error( - `Error: Invalid model "${options.model}". 
Valid models: ${validModels.join(', ')}` - ); - process.exit(1); - } - - const agentOptions = { - prompt: promptOrJobId, - urls, - schema, - schemaFile: options.schemaFile, - model: options.model, - maxCredits: options.maxCredits, - status: isStatusCheck, - wait: options.wait, - pollInterval: options.pollInterval, - timeout: options.timeout, - apiKey: options.apiKey, - apiUrl: options.apiUrl, - output: options.output, - json: options.json, - pretty: options.pretty, - }; - - await handleAgentCommand(agentOptions); - }); - - return agentCmd; -} - /** * Create and configure the browser command */ @@ -1029,11 +918,10 @@ Examples: return browserCmd; } -// Add crawl, map, search, agent, and browser commands to main program +// Add crawl, map, search, and browser commands to main program program.addCommand(createCrawlCommand()); program.addCommand(createMapCommand()); program.addCommand(createSearchCommand()); -program.addCommand(createAgentCommand()); program.addCommand(createBrowserCommand()); // Experimental: AI workflow commands diff --git a/src/types/agent.ts b/src/types/agent.ts deleted file mode 100644 index e2a5c9a..0000000 --- a/src/types/agent.ts +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Types and interfaces for the agent command - */ - -export type AgentModel = 'spark-1-pro' | 'spark-1-mini'; - -export type AgentStatus = 'processing' | 'completed' | 'failed'; - -export interface AgentOptions { - /** Natural language prompt describing the data to extract */ - prompt: string; - /** Model to use: spark-1-mini (default, cheaper) or spark-1-pro (higher accuracy) */ - model?: AgentModel; - /** Specific URLs to focus extraction on */ - urls?: string[]; - /** JSON schema for structured output */ - schema?: Record; - /** Path to JSON schema file */ - schemaFile?: string; - /** Maximum credits to spend (job fails if exceeded) */ - maxCredits?: number; - /** Check status of existing agent job */ - status?: boolean; - /** Wait for agent to complete before returning results 
*/ - wait?: boolean; - /** Polling interval in seconds when waiting */ - pollInterval?: number; - /** Timeout in seconds when waiting */ - timeout?: number; - /** API key for Firecrawl */ - apiKey?: string; - /** API URL for Firecrawl */ - apiUrl?: string; - /** Output file path */ - output?: string; - /** Pretty print JSON output */ - pretty?: boolean; - /** Force JSON output */ - json?: boolean; -} - -export interface AgentResult { - success: boolean; - data?: { - jobId: string; - status: AgentStatus; - }; - error?: string; -} - -export interface AgentStatusResult { - success: boolean; - data?: { - id: string; - status: AgentStatus; - data?: any; - creditsUsed?: number; - expiresAt?: string; - }; - error?: string; -} From 8d50103c5bc24aa73d661ac715ffc72f50b43549 Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Thu, 12 Mar 2026 12:23:07 -0400 Subject: [PATCH 4/4] keep agent command, only remove the agent skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the CLI agent command stays — only the firecrawl-agent skill and agent references in other skills are removed. --- README.md | 68 +++++ package.json | 2 +- src/__tests__/utils/mock-client.ts | 1 + src/commands/agent.ts | 433 +++++++++++++++++++++++++++++ src/index.ts | 114 +++++++- src/types/agent.ts | 61 ++++ 6 files changed, 677 insertions(+), 2 deletions(-) create mode 100644 src/commands/agent.ts create mode 100644 src/types/agent.ts diff --git a/README.md b/README.md index 76e2bfb..c920367 100644 --- a/README.md +++ b/README.md @@ -397,6 +397,74 @@ firecrawl credit-usage --json --pretty --- +### `agent` - AI-powered web data extraction + +Run an AI agent that autonomously browses and extracts structured data from the web based on natural language prompts. + +> **Note:** Agent tasks typically take **2 to 5 minutes** to complete, and sometimes longer for complex extractions. 
Use sparingly and consider `--max-credits` to limit costs. + +```bash +# Basic usage (returns job ID immediately) +firecrawl agent "Find the pricing plans for Firecrawl" + +# Wait for completion +firecrawl agent "Extract all product names and prices from this store" --wait + +# Focus on specific URLs +firecrawl agent "Get the main features listed" --urls https://example.com/features + +# Use structured output with JSON schema +firecrawl agent "Extract company info" --schema '{"type":"object","properties":{"name":{"type":"string"},"employees":{"type":"number"}}}' + +# Load schema from file +firecrawl agent "Extract product data" --schema-file ./product-schema.json --wait + +# Check status of an existing job +firecrawl agent +firecrawl agent --wait +``` + +#### Agent Options + +| Option | Description | +| --------------------------- | ------------------------------------------------------------- | +| `--urls ` | Comma-separated URLs to focus extraction on | +| `--model ` | `spark-1-mini` (default, cheaper) or `spark-1-pro` (accurate) | +| `--schema ` | JSON schema for structured output (inline JSON string) | +| `--schema-file ` | Path to JSON schema file for structured output | +| `--max-credits ` | Maximum credits to spend (job fails if exceeded) | +| `--status` | Check status of existing agent job | +| `--wait` | Wait for agent to complete before returning results | +| `--poll-interval ` | Polling interval in seconds when waiting (default: 5) | +| `--timeout ` | Timeout in seconds when waiting (default: no timeout) | +| `-o, --output ` | Save output to file | +| `--json` | Output as JSON format | +| `--pretty` | Pretty print JSON output | + +#### Examples + +```bash +# Research task with timeout +firecrawl agent "Find the top 5 competitors of Notion and their pricing" --wait --timeout 300 + +# Extract data with cost limit +firecrawl agent "Get all blog post titles and dates" --urls https://blog.example.com --max-credits 100 --wait + +# Use higher accuracy model for 
complex extraction +firecrawl agent "Extract detailed technical specifications" --model spark-1-pro --wait --pretty + +# Save structured results to file +firecrawl agent "Extract contact information" --schema-file ./contact-schema.json --wait -o contacts.json --pretty + +# Check job status without waiting +firecrawl agent abc123-def456-... --json + +# Poll a running job until completion +firecrawl agent abc123-def456-... --wait --poll-interval 10 +``` + +--- + ### `browser` - Browser sandbox sessions (Beta) Launch and control cloud browser sessions. By default, commands are sent to agent-browser (pre-installed in every sandbox). Use `--python` or `--node` to run Playwright code directly instead. diff --git a/package.json b/package.json index 8e5ed39..ae708d7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.11.0", + "version": "1.10.0", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { diff --git a/src/__tests__/utils/mock-client.ts b/src/__tests__/utils/mock-client.ts index 96c8bee..6165d6e 100644 --- a/src/__tests__/utils/mock-client.ts +++ b/src/__tests__/utils/mock-client.ts @@ -14,6 +14,7 @@ export interface MockFirecrawlClient { crawl?: any; map?: any; extract?: any; + agent?: any; } /** diff --git a/src/commands/agent.ts b/src/commands/agent.ts new file mode 100644 index 0000000..65d8341 --- /dev/null +++ b/src/commands/agent.ts @@ -0,0 +1,433 @@ +/** + * Agent command implementation + */ + +import type { + AgentOptions, + AgentResult, + AgentStatusResult, +} from '../types/agent'; +import { getClient } from '../utils/client'; +import { isJobId } from '../utils/job'; +import { writeOutput } from '../utils/output'; +import { createSpinner } from '../utils/spinner'; +import { readFileSync } from 'fs'; + +/** + * Extract detailed error message from API errors + */ +function 
extractErrorMessage(error: unknown): string { + if (error instanceof Error) { + const anyError = error as any; + + // Handle Firecrawl SDK errors with details array + if (anyError.details && Array.isArray(anyError.details)) { + const messages = anyError.details + .map((d: any) => d.message || JSON.stringify(d)) + .join('; '); + return messages || error.message; + } + + // Check for response data in the error (common in axios/fetch errors) + if (anyError.response?.data?.error) { + return anyError.response.data.error; + } + if (anyError.response?.data?.message) { + return anyError.response.data.message; + } + if (anyError.response?.data) { + return JSON.stringify(anyError.response.data); + } + + return error.message; + } + return 'Unknown error occurred'; +} + +/** + * Load schema from file + */ +function loadSchemaFromFile(filePath: string): Record { + try { + const content = readFileSync(filePath, 'utf-8'); + return JSON.parse(content); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + throw new Error(`Schema file not found: ${filePath}`); + } + if (error instanceof SyntaxError) { + throw new Error(`Invalid JSON in schema file: ${filePath}`); + } + throw error; + } +} + +/** + * Execute agent status check (with optional wait/polling) + */ +async function checkAgentStatus( + jobId: string, + options: AgentOptions +): Promise { + const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); + + // If not waiting, just return current status + if (!options.wait) { + try { + const status = await app.getAgentStatus(jobId); + return { + success: status.success, + data: { + id: jobId, + status: status.status, + data: status.data, + creditsUsed: status.creditsUsed, + expiresAt: status.expiresAt, + }, + }; + } catch (error) { + return { + success: false, + error: extractErrorMessage(error), + }; + } + } + + // Wait mode: poll until completion + const spinner = createSpinner(`Checking agent status...`); + spinner.start(); + + // 
Handle Ctrl+C gracefully + const handleInterrupt = () => { + spinner.stop(); + process.stderr.write('\n\nInterrupted. Agent may still be running.\n'); + process.stderr.write(`Check status with: firecrawl agent ${jobId}\n\n`); + process.exit(0); + }; + process.on('SIGINT', handleInterrupt); + + const pollMs = options.pollInterval ? options.pollInterval * 1000 : 5000; + const startTime = Date.now(); + const timeoutMs = options.timeout ? options.timeout * 1000 : undefined; + + try { + // Check initial status + let agentStatus = await app.getAgentStatus(jobId); + spinner.update(`Agent ${agentStatus.status}... (Job ID: ${jobId})`); + + while (true) { + if (agentStatus.status === 'completed') { + spinner.succeed('Agent completed'); + return { + success: agentStatus.success, + data: { + id: jobId, + status: agentStatus.status, + data: agentStatus.data, + creditsUsed: agentStatus.creditsUsed, + expiresAt: agentStatus.expiresAt, + }, + }; + } + + if (agentStatus.status === 'failed') { + spinner.fail('Agent failed'); + return { + success: false, + data: { + id: jobId, + status: agentStatus.status, + data: agentStatus.data, + creditsUsed: agentStatus.creditsUsed, + expiresAt: agentStatus.expiresAt, + }, + error: agentStatus.error, + }; + } + + // Check timeout + if (timeoutMs && Date.now() - startTime > timeoutMs) { + spinner.fail(`Timeout after ${options.timeout}s`); + return { + success: false, + error: `Timeout after ${options.timeout} seconds. Agent still processing.`, + }; + } + + await new Promise((resolve) => setTimeout(resolve, pollMs)); + agentStatus = await app.getAgentStatus(jobId); + spinner.update(`Agent ${agentStatus.status}... 
(Job ID: ${jobId})`); + } + } catch (error) { + spinner.fail('Failed to check agent status'); + return { + success: false, + error: extractErrorMessage(error), + }; + } finally { + process.removeListener('SIGINT', handleInterrupt); + } +} + +/** + * Execute agent command + */ +export async function executeAgent( + options: AgentOptions +): Promise { + try { + const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); + const { prompt, status, wait, pollInterval, timeout } = options; + + // If status flag is set or input looks like a job ID, check status + if (status || isJobId(prompt)) { + return await checkAgentStatus(prompt, options); + } + + // Load schema from file if specified + let schema: Record | undefined = options.schema as + | Record + | undefined; + if (options.schemaFile) { + schema = loadSchemaFromFile(options.schemaFile); + } + + // Build agent options + const agentParams: { + prompt: string; + urls?: string[]; + schema?: Record; + model?: 'spark-1-pro' | 'spark-1-mini'; + maxCredits?: number; + pollInterval?: number; + timeout?: number; + integration?: string; + } = { + prompt, + integration: 'cli', + }; + + if (options.urls && options.urls.length > 0) { + agentParams.urls = options.urls; + } + if (schema) { + agentParams.schema = schema; + } + if (options.model) { + agentParams.model = options.model as 'spark-1-pro' | 'spark-1-mini'; + } + if (options.maxCredits !== undefined) { + agentParams.maxCredits = options.maxCredits; + } + + // If wait mode, use polling with spinner + if (wait) { + const spinner = createSpinner('Starting agent...'); + spinner.start(); + + // Start agent first + let response; + try { + response = await app.startAgent(agentParams); + } catch (error) { + spinner.fail('Failed to start agent'); + return { + success: false, + error: extractErrorMessage(error), + }; + } + const jobId = response.id; + + // Handle Ctrl+C gracefully + const handleInterrupt = () => { + spinner.stop(); + 
process.stderr.write('\n\nInterrupted. Agent is still running.\n'); + process.stderr.write(`Check status with: firecrawl agent ${jobId}\n\n`); + process.exit(0); + }; + process.on('SIGINT', handleInterrupt); + + spinner.update(`Agent running... (Job ID: ${jobId})`); + + // Poll for status + const pollMs = pollInterval ? pollInterval * 1000 : 5000; + const startTime = Date.now(); + const timeoutMs = timeout ? timeout * 1000 : undefined; + + try { + while (true) { + await new Promise((resolve) => setTimeout(resolve, pollMs)); + + const agentStatus = await app.getAgentStatus(jobId); + + if (agentStatus.status === 'completed') { + process.removeListener('SIGINT', handleInterrupt); + spinner.succeed('Agent completed'); + return { + success: agentStatus.success, + data: { + id: jobId, + status: agentStatus.status, + data: agentStatus.data, + creditsUsed: agentStatus.creditsUsed, + expiresAt: agentStatus.expiresAt, + }, + }; + } + + if (agentStatus.status === 'failed') { + process.removeListener('SIGINT', handleInterrupt); + spinner.fail('Agent failed'); + return { + success: false, + data: { + id: jobId, + status: agentStatus.status, + data: agentStatus.data, + creditsUsed: agentStatus.creditsUsed, + expiresAt: agentStatus.expiresAt, + }, + error: agentStatus.error, + }; + } + + // Check timeout + if (timeoutMs && Date.now() - startTime > timeoutMs) { + process.removeListener('SIGINT', handleInterrupt); + spinner.fail(`Timeout after ${timeout}s (Job ID: ${jobId})`); + return { + success: false, + error: `Timeout after ${timeout} seconds. Agent still processing. 
// ---------------------------------------------------------------------------
// Agent command output (module imported by the CLI entry point as
// './commands/agent')
// ---------------------------------------------------------------------------

/**
 * Render an agent job status as human-readable text.
 *
 * Emits one line each for the job ID and status and, when present, the
 * credits used, the expiry time, and the pretty-printed result payload.
 *
 * @param data - Status payload; `undefined` yields an empty string.
 * @returns Newline-terminated text, or `''` when there is no payload.
 */
function formatAgentStatus(data: AgentStatusResult['data']): string {
  if (!data) return '';

  const lines: string[] = [`Job ID: ${data.id}`, `Status: ${data.status}`];

  if (data.creditsUsed !== undefined) {
    lines.push(`Credits Used: ${data.creditsUsed}`);
  }

  if (data.expiresAt) {
    const expiresDate = new Date(data.expiresAt);
    // An unparseable timestamp would otherwise print "Invalid Date";
    // fall back to the raw value so the user still sees what was returned.
    const expiresText = Number.isNaN(expiresDate.getTime())
      ? data.expiresAt
      : expiresDate.toLocaleString('en-US', {
          year: 'numeric',
          month: 'short',
          day: 'numeric',
          hour: '2-digit',
          minute: '2-digit',
        });
    lines.push(`Expires: ${expiresText}`);
  }

  // Nullish check rather than truthiness, so legitimate results such as
  // 0, false or '' are still printed.
  if (data.data !== undefined && data.data !== null) {
    lines.push('', 'Result:', JSON.stringify(data.data, null, 2));
  }

  return lines.join('\n') + '\n';
}

/**
 * Run the agent command and write its result to the configured output.
 *
 * On failure the error is printed to stderr and the process exits with
 * code 1. A status result that carries a `data` payload is printed either as
 * JSON (`options.json`) or as human-readable text; every other successful
 * result is printed as JSON. `options.pretty` selects 2-space indentation
 * for JSON output.
 *
 * @param options - Agent options assembled by the CLI.
 */
export async function handleAgentCommand(options: AgentOptions): Promise<void> {
  const result = await executeAgent(options);

  if (!result.success) {
    console.error('Error:', result.error);
    process.exit(1);
  }

  // JSON.stringify treats an undefined indent exactly like an omitted one.
  const indent = options.pretty ? 2 : undefined;

  // Status result (completed agent job with data)
  if ('data' in result && result.data && 'data' in result.data) {
    const statusResult = result as AgentStatusResult;
    if (statusResult.data) {
      const statusOutput = options.json
        ? JSON.stringify({ success: true, ...statusResult.data }, null, indent)
        : formatAgentStatus(statusResult.data);

      writeOutput(statusOutput, options.output, !!options.output);
      return;
    }
  }

  // Agent start result (job ID)
  const agentResult = result as AgentResult;
  if (!agentResult.data) {
    return;
  }

  const payload =
    'jobId' in agentResult.data
      ? {
          success: true,
          data: {
            jobId: agentResult.data.jobId,
            status: agentResult.data.status,
          },
        }
      : agentResult.data;

  writeOutput(
    JSON.stringify(payload, null, indent),
    options.output,
    !!options.output
  );
}

// ---------------------------------------------------------------------------
// CLI wiring (src/index.ts)
// ---------------------------------------------------------------------------

/**
 * Parse a numeric CLI option value, rejecting non-numeric input.
 *
 * Passing `parseInt`/`parseFloat` straight to Commander lets `NaN` flow into
 * the agent options unnoticed; this reports the problem and exits instead.
 *
 * @param value - Raw string from the command line.
 * @param flag - Flag name used in the error message.
 * @param integer - Parse with `parseInt(value, 10)` instead of `parseFloat`.
 * @returns The parsed number.
 */
function parseNumericOption(
  value: string,
  flag: string,
  integer = false
): number {
  const parsed = integer ? parseInt(value, 10) : parseFloat(value);
  if (Number.isNaN(parsed)) {
    console.error(`Error: ${flag} expects a number, received "${value}"`);
    process.exit(1);
  }
  return parsed;
}

/**
 * Create and configure the agent command.
 *
 * The command takes a single positional argument that is either a natural
 * language prompt or a job ID; a job ID (detected via `isJobId`) or the
 * `--status` flag switches the command to a status check.
 *
 * The command is registered on the main program with
 * `program.addCommand(createAgentCommand())`, and `'agent'` is listed in
 * `AUTH_REQUIRED_COMMANDS`.
 *
 * NOTE(review): the `<...>` value placeholders below are descriptive names;
 * confirm them against the upstream help text.
 *
 * @returns The configured `agent` command.
 */
function createAgentCommand(): Command {
  const agentCmd = new Command('agent')
    .description('Run an AI agent to extract data from the web')
    .argument(
      '<prompt-or-job-id>',
      'Natural language prompt describing data to extract, or job ID to check status'
    )
    .option('--urls <urls>', 'Comma-separated URLs to focus extraction on')
    .option(
      '--model <model>',
      'Model to use: spark-1-mini (default, cheaper) or spark-1-pro (higher accuracy)'
    )
    .option(
      '--schema <json>',
      'JSON schema for structured output (inline JSON string)'
    )
    .option(
      '--schema-file <path>',
      'Path to JSON schema file for structured output'
    )
    .option(
      '--max-credits <number>',
      'Maximum credits to spend (job fails if exceeded)',
      (value: string) => parseNumericOption(value, '--max-credits', true)
    )
    .option('--status', 'Check status of existing agent job', false)
    .option(
      '--wait',
      'Wait for agent to complete before returning results',
      false
    )
    .option(
      '--poll-interval <seconds>',
      'Polling interval in seconds when waiting (default: 5)',
      (value: string) => parseNumericOption(value, '--poll-interval')
    )
    .option(
      '--timeout <seconds>',
      'Timeout in seconds when waiting (default: no timeout)',
      (value: string) => parseNumericOption(value, '--timeout')
    )
    .option(
      '-k, --api-key <key>',
      'Firecrawl API key (overrides global --api-key)'
    )
    .option('--api-url <url>', 'API URL (overrides global --api-url)')
    .option('-o, --output <path>', 'Output file path (default: stdout)')
    .option('--json', 'Output as JSON format', false)
    .option('--pretty', 'Pretty print JSON output', false)
    .action(async (promptOrJobId, options) => {
      // Auto-detect if it's a job ID (UUID format)
      const isStatusCheck = options.status || isJobId(promptOrJobId);

      // Parse URLs: split on commas, trim, drop empty entries
      let urls: string[] | undefined;
      if (options.urls) {
        urls = options.urls
          .split(',')
          .map((u: string) => u.trim())
          .filter((u: string) => u.length > 0);
      }

      // Parse inline schema
      let schema: Record<string, unknown> | undefined;
      if (options.schema) {
        try {
          schema = JSON.parse(options.schema) as Record<string, unknown>;
        } catch {
          console.error('Error: Invalid JSON in --schema option');
          process.exit(1);
        }
      }

      // Validate model
      const validModels = ['spark-1-pro', 'spark-1-mini'];
      if (options.model && !validModels.includes(options.model)) {
        console.error(
          `Error: Invalid model "${options.model}". Valid models: ${validModels.join(', ')}`
        );
        process.exit(1);
      }

      const agentOptions = {
        prompt: promptOrJobId,
        urls,
        schema,
        schemaFile: options.schemaFile,
        model: options.model,
        maxCredits: options.maxCredits,
        status: isStatusCheck,
        wait: options.wait,
        pollInterval: options.pollInterval,
        timeout: options.timeout,
        apiKey: options.apiKey,
        apiUrl: options.apiUrl,
        output: options.output,
        json: options.json,
        pretty: options.pretty,
      };

      await handleAgentCommand(agentOptions);
    });

  return agentCmd;
}

// ---------------------------------------------------------------------------
// Types and interfaces for the agent command (src/types/agent.ts)
// ---------------------------------------------------------------------------

/** Models accepted by the agent command. */
export type AgentModel = 'spark-1-pro' | 'spark-1-mini';

/** Lifecycle states reported for an agent job. */
export type AgentStatus = 'processing' | 'completed' | 'failed';

/** Options accepted by the agent command. */
export interface AgentOptions {
  /** Natural language prompt describing the data to extract */
  prompt: string;
  /** Model to use: spark-1-mini (default, cheaper) or spark-1-pro (higher accuracy) */
  model?: AgentModel;
  /** Specific URLs to focus extraction on */
  urls?: string[];
  /** JSON schema for structured output */
  schema?: Record<string, unknown>;
  /** Path to JSON schema file */
  schemaFile?: string;
  /** Maximum credits to spend (job fails if exceeded) */
  maxCredits?: number;
  /** Check status of existing agent job */
  status?: boolean;
  /** Wait for agent to complete before returning results */
  wait?: boolean;
  /** Polling interval in seconds when waiting */
  pollInterval?: number;
  /** Timeout in seconds when waiting */
  timeout?: number;
  /** API key for Firecrawl */
  apiKey?: string;
  /** API URL for Firecrawl */
  apiUrl?: string;
  /** Output file path */
  output?: string;
  /** Pretty print JSON output */
  pretty?: boolean;
  /** Force JSON output */
  json?: boolean;
}

/** Result of starting an agent job. */
export interface AgentResult {
  success: boolean;
  data?: {
    jobId: string;
    status: AgentStatus;
  };
  error?: string;
}

/** Result of querying an agent job's status. */
export interface AgentStatusResult {
  success: boolean;
  data?: {
    id: string;
    status: AgentStatus;
    data?: any;
    creditsUsed?: number;
    expiresAt?: string;
  };
  error?: string;
}