From 07dcf3093ce1f72a3a9771a342204cbff365b977 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 10 Jan 2026 20:33:25 +0100 Subject: [PATCH 1/4] Update server.py --- src/scrapegraph_mcp/server.py | 311 ++++++++++++++++++++++++++++++++-- 1 file changed, 299 insertions(+), 12 deletions(-) diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py index 906a224..83446eb 100644 --- a/src/scrapegraph_mcp/server.py +++ b/src/scrapegraph_mcp/server.py @@ -90,20 +90,42 @@ def __init__(self, api_key: str): self.client = httpx.Client(timeout=httpx.Timeout(120.0)) - def markdownify(self, website_url: str) -> Dict[str, Any]: + def markdownify( + self, + website_url: str, + headers: Optional[Dict[str, str]] = None, + steps: Optional[List[str]] = None, + stealth: Optional[bool] = None, + stream: Optional[bool] = None, + mock: Optional[bool] = None + ) -> Dict[str, Any]: """ Convert a webpage into clean, formatted markdown. Args: website_url: URL of the webpage to convert + headers: HTTP headers to include in the request (optional) + steps: Interaction steps before conversion (optional) + stealth: Enable stealth mode to avoid bot detection (optional) + stream: Enable streaming response for real-time updates (optional) + mock: Return mock data for testing purposes (optional) Returns: Dictionary containing the markdown result """ url = f"{self.BASE_URL}/markdownify" - data = { - "website_url": website_url - } + data = {"website_url": website_url} + + if headers is not None: + data["headers"] = headers + if steps is not None: + data["steps"] = steps + if stealth is not None: + data["stealth"] = stealth + if stream is not None: + data["stream"] = stream + if mock is not None: + data["mock"] = mock response = self.client.post(url, headers=self.headers, json=data) @@ -113,6 +135,25 @@ def markdownify(self, website_url: str) -> Dict[str, Any]: return response.json() + def markdownify_status(self, request_id: str) -> Dict[str, Any]: + """ + Get the status of a markdownify request. + + Args: + request_id: The request ID to check status for + + Returns: + Dictionary containing the request status and results + """ + url = f"{self.BASE_URL}/markdownify/{request_id}" + response = self.client.get(url, headers=self.headers) + + if response.status_code != 200: + error_msg = f"Error {response.status_code}: {response.text}" + raise Exception(error_msg) + + return response.json() + def smartscraper( self, user_prompt: str, @@ -175,6 +216,25 @@ def smartscraper( return response.json() + def smartscraper_status(self, request_id: str) -> Dict[str, Any]: + """ + Get the status of a smartscraper request. + + Args: + request_id: The request ID to check status for + + Returns: + Dictionary containing the request status and results + """ + url = f"{self.BASE_URL}/smartscraper/{request_id}" + response = self.client.get(url, headers=self.headers) + + if response.status_code != 200: + error_msg = f"Error {response.status_code}: {response.text}" + raise Exception(error_msg) + + return response.json() + def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scrolls: int = None) -> Dict[str, Any]: """ Perform AI-powered web searches with structured results. 
@@ -208,13 +268,23 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr return response.json() - def scrape(self, website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]: + def scrape( + self, + website_url: str, + render_heavy_js: Optional[bool] = None, + mock: Optional[bool] = None, + stealth: Optional[bool] = None, + stream: Optional[bool] = None + ) -> Dict[str, Any]: """ Basic scrape endpoint to fetch page content. Args: website_url: URL to scrape render_heavy_js: Whether to render heavy JS (optional) + mock: Return mock data for testing purposes (optional) + stealth: Enable stealth mode to avoid bot detection (optional) + stream: Enable streaming response for real-time updates (optional) Returns: Dictionary containing the scraped result @@ -223,23 +293,32 @@ def scrape(self, website_url: str, render_heavy_js: Optional[bool] = None) -> Di payload: Dict[str, Any] = {"website_url": website_url} if render_heavy_js is not None: payload["render_heavy_js"] = render_heavy_js + if mock is not None: + payload["mock"] = mock + if stealth is not None: + payload["stealth"] = stealth + if stream is not None: + payload["stream"] = stream response = self.client.post(url, headers=self.headers, json=payload) response.raise_for_status() return response.json() - def sitemap(self, website_url: str) -> Dict[str, Any]: + def sitemap(self, website_url: str, stream: Optional[bool] = None) -> Dict[str, Any]: """ Extract sitemap for a given website. Args: website_url: Base website URL + stream: Enable streaming response for real-time updates (optional) Returns: Dictionary containing sitemap URLs/structure """ url = f"{self.BASE_URL}/sitemap" payload: Dict[str, Any] = {"website_url": website_url} + if stream is not None: + payload["stream"] = stream response = self.client.post(url, headers=self.headers, json=payload) response.raise_for_status() @@ -1353,7 +1432,15 @@ def tool_comparison_guide() -> str: # Add tool for markdownify @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) -def markdownify(website_url: str, ctx: Context) -> Dict[str, Any]: +def markdownify( + website_url: str, + ctx: Context, + headers: Optional[Dict[str, str]] = None, + steps: Optional[List[str]] = None, + stealth: Optional[bool] = None, + stream: Optional[bool] = None, + mock: Optional[bool] = None +) -> Dict[str, Any]: """ Convert a webpage into clean, formatted markdown. @@ -1375,6 +1462,49 @@ def markdownify(website_url: str, ctx: Context) -> Dict[str, Any]: * ftp://example.com (unsupported protocol) * localhost:3000 (missing protocol) + headers (Optional[Dict[str, str]]): HTTP headers to include in the request. + - Custom headers to send with the HTTP request + - Useful for authentication, custom user agents, or API requirements + - Examples: + * {"User-Agent": "Custom Browser"} + * {"Authorization": "Bearer token"} + * {"Accept-Language": "en-US"} + - Default: None (uses standard headers) + + steps (Optional[List[str]]): Website interaction steps before conversion. + - List of actions to perform before extracting markdown content + - Useful for interacting with dynamic pages or navigating through content + - Examples: + * ["Click 'Show More' button", "Wait for content to load"] + * ["Navigate to About section", "Expand all details"] + * ["Accept cookies", "Close popup"] + - Default: None (no interaction steps) + + stealth (Optional[bool]): Enable stealth mode to avoid bot detection. 
+ - Default: false (standard request) + - Set to true to bypass basic anti-scraping measures + - Uses techniques to appear more like a human browser + - When to use true: + * Sites with bot detection systems + * Sites that block automated requests + * Protected content that requires human-like behavior + - Note: May increase processing time and is not 100% guaranteed + + stream (Optional[bool]): Enable streaming response for real-time updates. + - Default: false (standard response) + - Set to true for streaming mode to receive data as it's being processed + - Useful for monitoring progress on large pages or slow conversions + - Provides real-time feedback during the markdown conversion + + mock (Optional[bool]): Return mock data for testing purposes. + - Default: false (real conversion) + - Set to true to receive mock/sample markdown instead of actually converting the page + - Useful for testing and development without consuming credits + - When to use true: + * Testing your integration without making real requests + * Prototyping workflows before production use + * Development and debugging scenarios + Returns: Dictionary containing: - markdown: The converted markdown content as a string @@ -1390,7 +1520,67 @@ def markdownify(website_url: str, ctx: Context) -> Dict[str, Any]: try: api_key = get_api_key(ctx) client = ScapeGraphClient(api_key) - return client.markdownify(website_url) + return client.markdownify( + website_url=website_url, + headers=headers, + steps=steps, + stealth=stealth, + stream=stream, + mock=mock + ) + except Exception as e: + return {"error": str(e)} + + +# Add tool for markdownify status +@mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) +def markdownify_status(request_id: str, ctx: Context) -> Dict[str, Any]: + """ + Get the status and results of a markdownify conversion request. + + This tool retrieves the status of a previously initiated markdown conversion using the request_id. + Use this when you need to check the status or retrieve results of an asynchronous markdownify operation. + Read-only operation with no side effects. + + Args: + request_id (str): The unique request identifier returned by a previous markdownify call. 
+ - Format: UUID string (e.g., "123e4567-e89b-12d3-a456-426614174000") + - Used to track and retrieve specific conversion results + - Each markdownify operation may return a request_id for status checking + - Examples: + * "7f3d8a9c-1234-5678-9abc-def012345678" + * "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + Returns: + Dictionary containing: + - request_id: The request identifier + - status: Current status of the conversion ("queued", "processing", "completed", "failed") + - result: The converted markdown content (when status is "completed") + - website_url: The URL that was converted + - error: Error message if status is "failed" (empty string otherwise) + - processing_time: Time taken for the conversion (when completed) + - credits_used: Number of credits consumed + + Raises: + ValueError: If request_id is malformed or invalid + HTTPError: If the request cannot be found (404) or server error occurs + + Use Cases: + - Checking the status of long-running markdown conversions + - Retrieving results from asynchronous markdownify operations + - Monitoring conversion progress for large or complex pages + - Verifying completion before proceeding with next steps + + Note: + - Some markdownify operations may complete synchronously and not require status checks + - If status is "processing" or "queued", poll this endpoint again after a delay + - Once status is "completed", the result field will contain the markdown content + - Failed requests will have status "failed" and an error message in the error field + """ + try: + api_key = get_api_key(ctx) + client = ScapeGraphClient(api_key) + return client.markdownify_status(request_id=request_id) except Exception as e: return {"error": str(e)} @@ -1576,6 +1766,63 @@ def smartscraper( return {"error": str(e)} +# Add tool for smartscraper status +@mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) +def smartscraper_status(request_id: str, ctx: Context) -> Dict[str, Any]: + """ + Get the status and results of a smartscraper extraction request. + + This tool retrieves the status of a previously initiated AI-powered data extraction using the request_id. + Use this when you need to check the status or retrieve results of an asynchronous smartscraper operation. + Read-only operation with no side effects. + + Args: + request_id (str): The unique request identifier returned by a previous smartscraper call. 
+ - Format: UUID string (e.g., "123e4567-e89b-12d3-a456-426614174000") + - Used to track and retrieve specific extraction results + - Each smartscraper operation may return a request_id for status checking + - Examples: + * "7f3d8a9c-1234-5678-9abc-def012345678" + * "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + Returns: + Dictionary containing: + - request_id: The request identifier + - status: Current status of the extraction ("queued", "processing", "completed", "failed") + - result: The extracted structured data (when status is "completed") + - website_url: The URL that was scraped (if applicable) + - user_prompt: The original extraction prompt + - error: Error message if status is "failed" (empty string otherwise) + - processing_time: Time taken for the extraction (when completed) + - credits_used: Number of credits consumed + - pages_processed: Number of pages analyzed + + Raises: + ValueError: If request_id is malformed or invalid + HTTPError: If the request cannot be found (404) or server error occurs + + Use Cases: + - Checking the status of long-running data extractions + - Retrieving results from asynchronous smartscraper operations + - Monitoring extraction progress for complex or multi-page scraping + - Verifying completion before proceeding with next steps + - Handling extraction errors and retries + + Note: + - Some smartscraper operations may complete synchronously and not require status checks + - If status is "processing" or "queued", poll this endpoint again after a delay + - Once status is "completed", the result field will contain the extracted structured data + - Failed requests will have status "failed" and an error message in the error field + - The extracted data format depends on the output_schema provided in the original request + """ + try: + api_key = get_api_key(ctx) + client = ScapeGraphClient(api_key) + return client.smartscraper_status(request_id=request_id) + except Exception as e: + return {"error": str(e)} + + # Add tool for searchscraper @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": False}) def searchscraper( @@ -1851,7 +2098,10 @@ def smartcrawler_fetch_results(request_id: str, ctx: Context) -> Dict[str, Any]: def scrape( website_url: str, ctx: Context, - render_heavy_js: Optional[bool] = None + render_heavy_js: Optional[bool] = None, + mock: Optional[bool] = None, + stealth: Optional[bool] = None, + stream: Optional[bool] = None ) -> Dict[str, Any]: """ Fetch raw page content from any URL with optional JavaScript rendering. @@ -1896,6 +2146,31 @@ def scrape( * true: 15-30 seconds processing time (waits for JS execution) - Cost: Same (1 credit) regardless of render_heavy_js setting + mock (Optional[bool]): Return mock data for testing purposes. + - Default: false (real scraping) + - Set to true to receive mock/sample data instead of actually scraping the website + - Useful for testing and development without consuming credits or hitting rate limits + - When to use true: + * Testing your integration without making real requests + * Prototyping workflows before production use + * Development and debugging scenarios + + stealth (Optional[bool]): Enable stealth mode to avoid bot detection. 
+ - Default: false (standard scraping) + - Set to true to bypass basic anti-scraping measures + - Uses techniques to appear more like a human browser + - When to use true: + * Sites with bot detection systems + * E-commerce sites with protection + * Sites that block automated requests + - Note: May increase processing time and is not 100% guaranteed + + stream (Optional[bool]): Enable streaming response for real-time updates. + - Default: false (standard response) + - Set to true for streaming mode to receive data as it's being processed + - Useful for monitoring progress on large or slow-loading pages + - Provides real-time feedback during the scraping operation + Returns: Dictionary containing: - html_content: The raw HTML content of the page as a string @@ -1929,7 +2204,13 @@ def scrape( try: api_key = get_api_key(ctx) client = ScapeGraphClient(api_key) - return client.scrape(website_url=website_url, render_heavy_js=render_heavy_js) + return client.scrape( + website_url=website_url, + render_heavy_js=render_heavy_js, + mock=mock, + stealth=stealth, + stream=stream + ) except httpx.HTTPError as http_err: return {"error": str(http_err)} except ValueError as val_err: @@ -1938,7 +2219,7 @@ def scrape( # Add tool for sitemap extraction @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) -def sitemap(website_url: str, ctx: Context) -> Dict[str, Any]: +def sitemap(website_url: str, ctx: Context, stream: Optional[bool] = None) -> Dict[str, Any]: """ Extract and discover the complete sitemap structure of any website. @@ -1967,6 +2248,12 @@ def sitemap(website_url: str, ctx: Context) -> Dict[str, Any]: * Discovers pages through internal link analysis * Identifies common URL patterns and structures + stream (Optional[bool]): Enable streaming response for real-time updates. + - Default: false (standard response) + - Set to true for streaming mode to receive updates as they are discovered + - Useful for large sites where discovery may take significant time + - Provides progress updates during the sitemap extraction process + Returns: Dictionary containing: - discovered_urls: List of all URLs found on the website @@ -2012,7 +2299,7 @@ def sitemap(website_url: str, ctx: Context) -> Dict[str, Any]: try: api_key = get_api_key(ctx) client = ScapeGraphClient(api_key) - return client.sitemap(website_url=website_url) + return client.sitemap(website_url=website_url, stream=stream) except httpx.HTTPError as http_err: return {"error": str(http_err)} except ValueError as val_err: From 46379f246d28a90e1bfae03c0e071fe82b438b88 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 28 Jan 2026 08:44:20 +0100 Subject: [PATCH 2/4] Add time_range parameter to searchscraper Add support for filtering search results by time range in the searchscraper tool. This aligns with the API changes in sgai-api#386. The time_range parameter accepts: past_hour, past_24_hours, past_week, past_month, or past_year to filter search results by recency. 
Co-Authored-By: Claude Opus 4.5 --- src/scrapegraph_mcp/server.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py index 83446eb..857a660 100644 --- a/src/scrapegraph_mcp/server.py +++ b/src/scrapegraph_mcp/server.py @@ -55,7 +55,7 @@ import json import logging import os -from typing import Any, Dict, Optional, List, Union, Annotated +from typing import Any, Dict, Optional, List, Union, Annotated, Literal import httpx from fastmcp import Context, FastMCP @@ -235,7 +235,7 @@ def smartscraper_status(self, request_id: str) -> Dict[str, Any]: return response.json() - def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scrolls: int = None) -> Dict[str, Any]: + def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scrolls: int = None, time_range: str = None) -> Dict[str, Any]: """ Perform AI-powered web searches with structured results. @@ -243,6 +243,7 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr user_prompt: Search query or instructions num_results: Number of websites to search (optional, default: 3 websites = 30 credits) number_of_scrolls: Number of infinite scrolls to perform on each website (optional) + time_range: Filter results by time range (optional). Valid values: past_hour, past_24_hours, past_week, past_month, past_year Returns: Dictionary containing search results and reference URLs @@ -251,15 +252,19 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr data = { "user_prompt": user_prompt } - + # Add num_results to the request if provided if num_results is not None: data["num_results"] = num_results - + # Add number_of_scrolls to the request if provided if number_of_scrolls is not None: data["number_of_scrolls"] = number_of_scrolls + # Add time_range to the request if provided + if time_range is not None: + data["time_range"] = time_range + response = self.client.post(url, headers=self.headers, json=data) if response.status_code != 200: @@ -1829,7 +1834,8 @@ def searchscraper( user_prompt: str, ctx: Context, num_results: Optional[int] = None, - number_of_scrolls: Optional[int] = None + number_of_scrolls: Optional[int] = None, + time_range: Optional[Literal["past_hour", "past_24_hours", "past_week", "past_month", "past_year"]] = None ) -> Dict[str, Any]: """ Perform AI-powered web searches with structured data extraction. @@ -1877,6 +1883,22 @@ def searchscraper( * 5: Extensive feeds, long-form content with infinite scroll - Note: Increases processing time significantly (adds 5-10 seconds per scroll per page) + time_range (Optional[str]): Filter search results by time range. 
+ - Default: None (no time filter applied) + - Valid values: + * "past_hour": Results from the last hour + * "past_24_hours": Results from the last 24 hours + * "past_week": Results from the last 7 days + * "past_month": Results from the last 30 days + * "past_year": Results from the last 365 days + - Examples: + * Use "past_hour" for breaking news or real-time updates + * Use "past_24_hours" for recent developments + * Use "past_week" for current events and trending topics + * Use "past_month" for recent but not immediate information + * Use "past_year" for relatively recent content + - Note: Useful for finding recent information or filtering out outdated content + Returns: Dictionary containing: - search_results: Array of extracted data from each website found @@ -1903,7 +1925,7 @@ def searchscraper( try: api_key = get_api_key(ctx) client = ScapeGraphClient(api_key) - return client.searchscraper(user_prompt, num_results, number_of_scrolls) + return client.searchscraper(user_prompt, num_results, number_of_scrolls, time_range) except Exception as e: return {"error": str(e)} From 7d5e44455b9369a4332728568fe2723d9c89b62c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 28 Jan 2026 08:48:48 +0100 Subject: [PATCH 3/4] docs: update documentation for time_range parameter in searchscraper - Add time_range parameter to searchscraper documentation - Document supported values: past_hour, past_24_hours, past_week, past_month, past_year - Update MCP tools reference table - Add example usage with time_range - Update changelog to reflect SDK changes (scrapegraph-py#77, scrapegraph-js#2) Co-Authored-By: Claude Opus 4.5 --- .agent/README.md | 7 ++++++- .agent/system/project_architecture.md | 8 +++++--- README.md | 5 ++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.agent/README.md b/.agent/README.md index a72e962..ac61a57 100644 --- a/.agent/README.md +++ b/.agent/README.md @@ -178,7 +178,7 @@ Quick reference to all MCP tools: |------|------------|---------|---------|-------| | `markdownify` | `website_url` | Convert webpage to markdown | 2 | No | | `smartscraper` | `user_prompt`, `website_url`, `number_of_scrolls?`, `markdown_only?` | AI-powered data extraction | 10+ | No | -| `searchscraper` | `user_prompt`, `num_results?`, `number_of_scrolls?` | AI-powered web search | Variable | No | +| `searchscraper` | `user_prompt`, `num_results?`, `number_of_scrolls?`, `time_range?` | AI-powered web search | Variable | No | | `smartcrawler_initiate` | `url`, `prompt?`, `extraction_mode`, `depth?`, `max_pages?`, `same_domain_only?` | Start multi-page crawl | 100+ | Yes (returns request_id) | | `smartcrawler_fetch_results` | `request_id` | Get crawl results | N/A | No (polls status) | @@ -376,6 +376,11 @@ npx @modelcontextprotocol/inspector scrapegraph-mcp ## 📅 Changelog +### January 2026 +- ✅ Added `time_range` parameter to SearchScraper for filtering results by recency +- ✅ Supported time ranges: `past_hour`, `past_24_hours`, `past_week`, `past_month`, `past_year` +- ✅ Documentation updated to reflect SDK changes (scrapegraph-py#77, scrapegraph-js#2) + ### October 2025 - ✅ Initial comprehensive documentation created - ✅ Project architecture fully documented diff --git a/.agent/system/project_architecture.md b/.agent/system/project_architecture.md index 3133abd..ea1fb1d 100644 --- a/.agent/system/project_architecture.md +++ b/.agent/system/project_architecture.md @@ -1,6 +1,6 @@ # ScrapeGraph MCP Server - Project Architecture -**Last Updated:** October 2025 +**Last Updated:** January 
2026 **Version:** 1.0.0 ## Table of Contents @@ -255,7 +255,7 @@ The server exposes 5 tools to AI assistants: --- -### 3. `searchscraper(user_prompt: str, num_results: int = None, number_of_scrolls: int = None)` +### 3. `searchscraper(user_prompt: str, num_results: int = None, number_of_scrolls: int = None, time_range: str = None)` **Purpose:** Perform AI-powered web searches with structured results @@ -263,6 +263,7 @@ The server exposes 5 tools to AI assistants: - `user_prompt` (str) - Search query or instructions - `num_results` (int, optional) - Number of websites to search (default: 3 = 30 credits) - `number_of_scrolls` (int, optional) - Number of infinite scrolls per website +- `time_range` (str, optional) - Filter results by recency. Valid values: `past_hour`, `past_24_hours`, `past_week`, `past_month`, `past_year` **Returns:** ```json @@ -282,7 +283,8 @@ The server exposes 5 tools to AI assistants: "Research the latest AI developments in 2025" → AI calls: searchscraper( user_prompt="Latest AI developments in 2025", - num_results=5 + num_results=5, + time_range="past_week" ) ``` diff --git a/README.md b/README.md index c425852..5efefb1 100644 --- a/README.md +++ b/README.md @@ -96,11 +96,13 @@ Execute AI-powered web searches with structured, actionable results. searchscraper( user_prompt: str, num_results: int = None, - number_of_scrolls: int = None + number_of_scrolls: int = None, + time_range: str = None # Filter by: past_hour, past_24_hours, past_week, past_month, past_year ) ``` - **Credits**: Variable (3-20 websites × 10 credits) - **Use case**: Multi-source research and data aggregation +- **Time filtering**: Use `time_range` to filter results by recency (e.g., `"past_week"` for recent results) ### Advanced Scraping Tools @@ -518,6 +520,7 @@ The server enables sophisticated queries across various scraping scenarios: - **SearchScraper**: "Research and summarize recent developments in AI-powered web scraping" - **SearchScraper**: "Search for the top 5 articles about machine learning frameworks and extract key insights" - **SearchScraper**: "Find recent news about GPT-4 and provide a structured summary" +- **SearchScraper with time_range**: "Search for AI news from the past week only" (uses `time_range="past_week"`) ### Website Analysis - **Sitemap**: "Extract the complete sitemap structure from the ScrapeGraph website" From 81f2773b9eee3e290d5a5dd4541e31db7e954034 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 28 Jan 2026 12:26:03 +0100 Subject: [PATCH 4/4] refactor: remove steps parameter from markdownify Addresses PR review feedback to remove the steps parameter from the markdownify function as it's not needed for this endpoint. 
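As a sketch of the resulting call shape after this change (hypothetical URL and API key, import path assumed), `markdownify` now accepts only the remaining optional parameters:

```python
# Sketch only: markdownify call after the steps parameter removal.
from scrapegraph_mcp.server import ScapeGraphClient  # import path assumed

client = ScapeGraphClient(api_key="sgai-...")  # placeholder API key
result = client.markdownify(
    website_url="https://example.com",
    headers={"Accept-Language": "en-US"},  # optional custom headers
    stealth=True,                          # optional: reduce bot-detection blocks
    mock=True,                             # optional: sample output without consuming credits
)
print(result)
```
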
Co-Authored-By: Claude Opus 4.5 --- src/scrapegraph_mcp/server.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py index 857a660..2d881d2 100644 --- a/src/scrapegraph_mcp/server.py +++ b/src/scrapegraph_mcp/server.py @@ -94,7 +94,6 @@ def markdownify( self, website_url: str, headers: Optional[Dict[str, str]] = None, - steps: Optional[List[str]] = None, stealth: Optional[bool] = None, stream: Optional[bool] = None, mock: Optional[bool] = None @@ -105,7 +104,6 @@ def markdownify( Args: website_url: URL of the webpage to convert headers: HTTP headers to include in the request (optional) - steps: Interaction steps before conversion (optional) stealth: Enable stealth mode to avoid bot detection (optional) stream: Enable streaming response for real-time updates (optional) mock: Return mock data for testing purposes (optional) @@ -118,8 +116,6 @@ def markdownify( if headers is not None: data["headers"] = headers - if steps is not None: - data["steps"] = steps if stealth is not None: data["stealth"] = stealth if stream is not None: @@ -1441,7 +1437,6 @@ def markdownify( website_url: str, ctx: Context, headers: Optional[Dict[str, str]] = None, - steps: Optional[List[str]] = None, stealth: Optional[bool] = None, stream: Optional[bool] = None, mock: Optional[bool] = None @@ -1476,15 +1471,6 @@ def markdownify( * {"Accept-Language": "en-US"} - Default: None (uses standard headers) - steps (Optional[List[str]]): Website interaction steps before conversion. - - List of actions to perform before extracting markdown content - - Useful for interacting with dynamic pages or navigating through content - - Examples: - * ["Click 'Show More' button", "Wait for content to load"] - * ["Navigate to About section", "Expand all details"] - * ["Accept cookies", "Close popup"] - - Default: None (no interaction steps) - stealth (Optional[bool]): Enable stealth mode to avoid bot detection. - Default: false (standard request) - Set to true to bypass basic anti-scraping measures @@ -1528,7 +1514,6 @@ def markdownify( return client.markdownify( website_url=website_url, headers=headers, - steps=steps, stealth=stealth, stream=stream, mock=mock