From d82e92fd0713798db0b21afad7df52f70eca02f9 Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Sun, 16 Mar 2025 14:02:21 +0000 Subject: [PATCH 1/2] CG-12194: Add Web Browsing Tool similar to Linear ViewIssue Tool --- src/codegen/extensions/langchain/tools.py | 66 +++++- src/codegen/extensions/langchain/web_tools.py | 81 ++++++++ src/codegen/extensions/web/__init__.py | 15 ++ src/codegen/extensions/web/types.py | 29 +++ src/codegen/extensions/web/web.py | 191 ++++++++++++++++++ src/codegen/extensions/web/web_client.py | 179 ++++++++++++++++ 6 files changed, 560 insertions(+), 1 deletion(-) create mode 100644 src/codegen/extensions/langchain/web_tools.py create mode 100644 src/codegen/extensions/web/__init__.py create mode 100644 src/codegen/extensions/web/types.py create mode 100644 src/codegen/extensions/web/web.py create mode 100644 src/codegen/extensions/web/web_client.py diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 877b59f05..cc9e21f5f 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -25,6 +25,12 @@ from codegen.extensions.tools.search import search from codegen.extensions.tools.semantic_edit import semantic_edit from codegen.extensions.tools.semantic_search import semantic_search +from codegen.extensions.web.web_client import WebClient +from codegen.extensions.web.web import ( + web_browse_page_tool, + web_search_tool, + web_extract_images_tool, +) from codegen.sdk.core.codebase import Codebase from ..tools import ( @@ -43,6 +49,7 @@ ) from ..tools.relace_edit_prompts import RELACE_EDIT_PROMPT from ..tools.semantic_edit_prompts import FILE_EDIT_PROMPT +from .web_tools import WebBrowsePageTool, WebSearchTool, WebExtractImagesTool class ViewFileInput(BaseModel): @@ -823,6 +830,59 @@ def _run(self, content: str) -> str: return "✅ Message sent successfully" +######################################################################################################################## +# WEB +######################################################################################################################## + + +class WebBrowsePageTool(BaseTool): + """Tool for browsing a web page.""" + + name: ClassVar[str] = "web_browse_page" + description: ClassVar[str] = "Browse a web page and extract relevant information" + args_schema: ClassVar[type[BaseModel]] = WebBrowsePageTool.args_schema + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, url: str) -> str: + result = web_browse_page_tool(self.client, url) + return result.render() + + +class WebSearchTool(BaseTool): + """Tool for searching the web.""" + + name: ClassVar[str] = "web_search" + description: ClassVar[str] = "Search the web for information" + args_schema: ClassVar[type[BaseModel]] = WebSearchTool.args_schema + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, query: str) -> str: + result = web_search_tool(self.client, query) + return result.render() + + +class WebExtractImagesTool(BaseTool): + """Tool for extracting images from a web page.""" + + name: ClassVar[str] = "web_extract_images" + description: ClassVar[str] = "Extract images from a web page" + args_schema: ClassVar[type[BaseModel]] = WebExtractImagesTool.args_schema + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, url: str) -> str: + result = web_extract_images_tool(self.client, url) + return result.render() + + ######################################################################################################################## # EXPORT ######################################################################################################################## @@ -868,6 +928,10 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: LinearSearchIssuesTool(codebase), LinearCreateIssueTool(codebase), LinearGetTeamsTool(codebase), + # Web + WebBrowsePageTool(WebClient()), + WebSearchTool(WebClient()), + WebExtractImagesTool(WebClient()), ] @@ -1023,4 +1087,4 @@ def _run( ) -> str: result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase) - return result.render() + return result.render() \ No newline at end of file diff --git a/src/codegen/extensions/langchain/web_tools.py b/src/codegen/extensions/langchain/web_tools.py new file mode 100644 index 000000000..1cd54681c --- /dev/null +++ b/src/codegen/extensions/langchain/web_tools.py @@ -0,0 +1,81 @@ +"""LangChain tools for web browsing.""" + +from typing import ClassVar, Optional + +from langchain_core.tools.base import BaseTool +from pydantic import BaseModel, Field + +from codegen.extensions.web.web_client import WebClient +from codegen.extensions.web.web import ( + web_browse_page_tool, + web_search_tool, + web_extract_images_tool, +) + + +class WebBrowsePageInput(BaseModel): + """Input for browsing a web page.""" + + url: str = Field(..., description="URL of the web page to browse") + + +class WebBrowsePageTool(BaseTool): + """Tool for browsing web pages.""" + + name: ClassVar[str] = "web_browse_page" + description: ClassVar[str] = "Browse a web page and extract its content" + args_schema: ClassVar[type[BaseModel]] = WebBrowsePageInput + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, url: str) -> str: + result = web_browse_page_tool(self.client, url) + return result.render() + + +class WebSearchInput(BaseModel): + """Input for web search.""" + + query: str = Field(..., description="Search query string") + num_results: int = Field(default=10, description="Maximum number of results to return") + + +class WebSearchTool(BaseTool): + """Tool for searching the web.""" + + name: ClassVar[str] = "web_search" + description: ClassVar[str] = "Search the web using a search engine" + args_schema: ClassVar[type[BaseModel]] = WebSearchInput + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, query: str, num_results: int = 10) -> str: + result = web_search_tool(self.client, query, num_results) + return result.render() + + +class WebExtractImagesInput(BaseModel): + """Input for extracting images from a web page.""" + + url: str = Field(..., description="URL of the web page") + max_images: int = Field(default=20, description="Maximum number of images to extract") + + +class WebExtractImagesTool(BaseTool): + """Tool for extracting images from web pages.""" + + name: ClassVar[str] = "web_extract_images" + description: ClassVar[str] = "Extract images from a web page" + args_schema: ClassVar[type[BaseModel]] = WebExtractImagesInput + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, url: str, max_images: int = 20) -> str: + result = web_extract_images_tool(self.client, url, max_images) + return result.render() \ No newline at end of file diff --git a/src/codegen/extensions/web/__init__.py b/src/codegen/extensions/web/__init__.py new file mode 100644 index 000000000..ec4b398df --- /dev/null +++ b/src/codegen/extensions/web/__init__.py @@ -0,0 +1,15 @@ +"""Web browsing functionality.""" + +from codegen.extensions.web.web_client import WebClient +from codegen.extensions.web.web import ( + web_browse_page_tool, + web_search_tool, + web_extract_images_tool, +) + +__all__ = [ + "WebClient", + "web_browse_page_tool", + "web_search_tool", + "web_extract_images_tool", +] \ No newline at end of file diff --git a/src/codegen/extensions/web/types.py b/src/codegen/extensions/web/types.py new file mode 100644 index 000000000..f1707ee37 --- /dev/null +++ b/src/codegen/extensions/web/types.py @@ -0,0 +1,29 @@ +"""Types for web browsing functionality.""" + +from pydantic import BaseModel, Field + + +class WebPage(BaseModel): + """Represents a web page.""" + + url: str = Field(description="URL of the web page") + title: str = Field(description="Title of the web page") + content: str = Field(description="Main content of the web page") + status_code: int = Field(description="HTTP status code of the response") + + +class WebSearchResult(BaseModel): + """Represents a single search result.""" + + title: str = Field(description="Title of the search result") + url: str = Field(description="URL of the search result") + snippet: str = Field(description="Snippet or description of the search result") + + +class WebImage(BaseModel): + """Represents an image from a web page.""" + + url: str = Field(description="URL of the image") + alt_text: str | None = Field(None, description="Alternative text for the image") + width: int | None = Field(None, description="Width of the image in pixels") + height: int | None = Field(None, description="Height of the image in pixels") \ No newline at end of file diff --git a/src/codegen/extensions/web/web.py b/src/codegen/extensions/web/web.py new file mode 100644 index 000000000..38ffa750e --- /dev/null +++ b/src/codegen/extensions/web/web.py @@ -0,0 +1,191 @@ +"""Tools for web browsing functionality.""" + +from typing import ClassVar, List + +import requests +from pydantic import Field + +from codegen.extensions.web.web_client import WebClient +from codegen.extensions.tools.observation import Observation + + +class WebBrowsePageObservation(Observation): + """Response from browsing a web page.""" + + url: str = Field(description="URL of the browsed page") + title: str = Field(description="Title of the web page") + content: str = Field(description="Content of the web page") + status_code: int = Field(description="HTTP status code") + + str_template: ClassVar[str] = "Browsed page: {title} ({url})" + + +class WebSearchObservation(Observation): + """Response from web search.""" + + query: str = Field(description="Search query used") + results: List[dict] = Field(description="List of search results") + + str_template: ClassVar[str] = "Found {result_count} results for '{query}'" + + def _get_details(self) -> dict[str, str | int]: + """Get details for string representation.""" + return { + "result_count": len(self.results), + "query": self.query, + } + + +class WebExtractImagesObservation(Observation): + """Response from extracting images from a web page.""" + + url: str = Field(description="URL of the web page") + images: List[dict] = Field(description="List of extracted images") + + str_template: ClassVar[str] = "Extracted {image_count} images from {url}" + + def _get_details(self) -> dict[str, str | int]: + """Get details for string representation.""" + return { + "image_count": len(self.images), + "url": self.url, + } + + +def web_browse_page_tool(client: WebClient, url: str) -> WebBrowsePageObservation: + """Browse a web page and extract its content. + + Args: + client: WebClient instance + url: URL of the web page to browse + + Returns: + WebBrowsePageObservation with the page content + """ + try: + page = client.browse_page(url) + return WebBrowsePageObservation( + status="success", + url=page.url, + title=page.title, + content=page.content, + status_code=page.status_code, + ) + except requests.exceptions.RequestException as e: + # Network-related errors + return WebBrowsePageObservation( + status="error", + error=f"Network error when browsing page: {e!s}", + url=url, + title="Error", + content="", + status_code=0, + ) + except ValueError as e: + # Input validation errors + return WebBrowsePageObservation( + status="error", + error=f"Invalid input: {e!s}", + url=url, + title="Error", + content="", + status_code=0, + ) + except Exception as e: + # Catch-all for other errors + return WebBrowsePageObservation( + status="error", + error=f"Failed to browse page: {e!s}", + url=url, + title="Error", + content="", + status_code=0, + ) + + +def web_search_tool(client: WebClient, query: str, num_results: int = 10) -> WebSearchObservation: + """Search the web using a search engine. + + Args: + client: WebClient instance + query: Search query string + num_results: Maximum number of results to return + + Returns: + WebSearchObservation with search results + """ + try: + results = client.search(query, num_results) + return WebSearchObservation( + status="success", + query=query, + results=[result.dict() for result in results], + ) + except requests.exceptions.RequestException as e: + # Network-related errors + return WebSearchObservation( + status="error", + error=f"Network error when searching: {e!s}", + query=query, + results=[], + ) + except ValueError as e: + # Input validation errors + return WebSearchObservation( + status="error", + error=f"Invalid input: {e!s}", + query=query, + results=[], + ) + except Exception as e: + # Catch-all for other errors + return WebSearchObservation( + status="error", + error=f"Failed to search: {e!s}", + query=query, + results=[], + ) + + +def web_extract_images_tool(client: WebClient, url: str, max_images: int = 20) -> WebExtractImagesObservation: + """Extract images from a web page. + + Args: + client: WebClient instance + url: URL of the web page + max_images: Maximum number of images to extract + + Returns: + WebExtractImagesObservation with extracted images + """ + try: + images = client.extract_images(url, max_images) + return WebExtractImagesObservation( + status="success", + url=url, + images=[image.dict() for image in images], + ) + except requests.exceptions.RequestException as e: + # Network-related errors + return WebExtractImagesObservation( + status="error", + error=f"Network error when extracting images: {e!s}", + url=url, + images=[], + ) + except ValueError as e: + # Input validation errors + return WebExtractImagesObservation( + status="error", + error=f"Invalid input: {e!s}", + url=url, + images=[], + ) + except Exception as e: + # Catch-all for other errors + return WebExtractImagesObservation( + status="error", + error=f"Failed to extract images: {e!s}", + url=url, + images=[], + ) \ No newline at end of file diff --git a/src/codegen/extensions/web/web_client.py b/src/codegen/extensions/web/web_client.py new file mode 100644 index 000000000..58c94e955 --- /dev/null +++ b/src/codegen/extensions/web/web_client.py @@ -0,0 +1,179 @@ +"""Client for web browsing functionality.""" + +import os +from typing import List, Optional +import requests +from bs4 import BeautifulSoup +from urllib.parse import urlparse, urljoin + +from codegen.extensions.web.types import WebPage, WebSearchResult, WebImage +from codegen.shared.logging.get_logger import get_logger + +logger = get_logger(__name__) + + +class WebClient: + """Client for web browsing functionality.""" + + def __init__(self, user_agent: Optional[str] = None, max_retries: int = 3, timeout: int = 10): + """Initialize the web client. + + Args: + user_agent: Custom user agent string. If None, a default one will be used. + max_retries: Maximum number of retries for failed requests. + timeout: Timeout in seconds for requests. + """ + self.timeout = timeout + + # Set up a session with retry logic + self.session = requests.Session() + + # Configure retries + adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) + self.session.mount('http://', adapter) + self.session.mount('https://', adapter) + + # Set user agent + if not user_agent: + user_agent = os.getenv("WEB_USER_AGENT", "Codegen Web Browser Tool/1.0") + self.session.headers.update({"User-Agent": user_agent}) + + def browse_page(self, url: str) -> WebPage: + """Browse a web page and extract its content. + + Args: + url: URL of the web page to browse. + + Returns: + WebPage object containing the page content. + + Raises: + ValueError: If the URL is invalid or the page cannot be accessed. + """ + try: + # Validate URL + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + raise ValueError(f"Invalid URL: {url}") + + # Make the request + response = self.session.get(url, timeout=self.timeout) + response.raise_for_status() + + # Parse the HTML + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract title + title = soup.title.string if soup.title else "No title" + + # Extract main content (simplified approach) + # Remove script and style elements + for script in soup(["script", "style"]): + script.extract() + + # Get text content + content = soup.get_text(separator='\n', strip=True) + + # Truncate content if too long (100k chars max) + if len(content) > 100000: + content = content[:100000] + "... [content truncated]" + + return WebPage( + url=url, + title=title, + content=content, + status_code=response.status_code + ) + + except requests.exceptions.RequestException as e: + logger.error(f"Error browsing page {url}: {str(e)}") + raise ValueError(f"Failed to access URL: {str(e)}") + except Exception as e: + logger.error(f"Error processing page {url}: {str(e)}") + raise ValueError(f"Error processing page: {str(e)}") + + def search(self, query: str, num_results: int = 10) -> List[WebSearchResult]: + """Search the web using a search engine API. + + Note: This is a placeholder. In a real implementation, you would integrate + with a search engine API like Google Custom Search, Bing, or DuckDuckGo. + + Args: + query: Search query string. + num_results: Maximum number of results to return. + + Returns: + List of WebSearchResult objects. + + Raises: + ValueError: If the search fails. + """ + # This is a placeholder. In a real implementation, you would: + # 1. Call a search engine API + # 2. Parse the results + # 3. Return them as WebSearchResult objects + + # For now, return a message explaining this is a placeholder + placeholder = WebSearchResult( + title="Search Functionality Placeholder", + url="https://example.com/search", + snippet="This is a placeholder for search functionality. In a real implementation, " + "this would integrate with a search engine API like Google Custom Search, " + "Bing, or DuckDuckGo." + ) + + return [placeholder] + + def extract_images(self, url: str, max_images: int = 20) -> List[WebImage]: + """Extract images from a web page. + + Args: + url: URL of the web page. + max_images: Maximum number of images to extract. + + Returns: + List of WebImage objects. + + Raises: + ValueError: If the URL is invalid or the page cannot be accessed. + """ + try: + # Make the request + response = self.session.get(url, timeout=self.timeout) + response.raise_for_status() + + # Parse the HTML + soup = BeautifulSoup(response.text, 'html.parser') + + # Find all image tags + img_tags = soup.find_all('img', limit=max_images) + + images = [] + for img in img_tags: + # Get image URL (handle relative URLs) + img_url = img.get('src', '') + if img_url: + img_url = urljoin(url, img_url) + else: + continue + + # Get alt text, width, and height + alt_text = img.get('alt', None) + width = int(img.get('width', 0)) or None + height = int(img.get('height', 0)) or None + + images.append(WebImage( + url=img_url, + alt_text=alt_text, + width=width, + height=height + )) + + return images + + except requests.exceptions.RequestException as e: + logger.error(f"Error accessing page {url}: {str(e)}") + raise ValueError(f"Failed to access URL: {str(e)}") + except Exception as e: + logger.error(f"Error extracting images from {url}: {str(e)}") + raise ValueError(f"Error extracting images: {str(e)}") \ No newline at end of file From dee8c9a795a62b270b3ab0dc4049a9a954258fa4 Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Sun, 16 Mar 2025 14:03:05 +0000 Subject: [PATCH 2/2] Automated pre-commit update --- src/codegen/extensions/langchain/tools.py | 8 +- src/codegen/extensions/langchain/web_tools.py | 8 +- src/codegen/extensions/web/__init__.py | 8 +- src/codegen/extensions/web/types.py | 8 +- src/codegen/extensions/web/web.py | 22 +-- src/codegen/extensions/web/web_client.py | 134 +++++++++--------- 6 files changed, 91 insertions(+), 97 deletions(-) diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index cc9e21f5f..64a993fb1 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -25,12 +25,12 @@ from codegen.extensions.tools.search import search from codegen.extensions.tools.semantic_edit import semantic_edit from codegen.extensions.tools.semantic_search import semantic_search -from codegen.extensions.web.web_client import WebClient from codegen.extensions.web.web import ( web_browse_page_tool, - web_search_tool, web_extract_images_tool, + web_search_tool, ) +from codegen.extensions.web.web_client import WebClient from codegen.sdk.core.codebase import Codebase from ..tools import ( @@ -49,7 +49,7 @@ ) from ..tools.relace_edit_prompts import RELACE_EDIT_PROMPT from ..tools.semantic_edit_prompts import FILE_EDIT_PROMPT -from .web_tools import WebBrowsePageTool, WebSearchTool, WebExtractImagesTool +from .web_tools import WebBrowsePageTool, WebExtractImagesTool, WebSearchTool class ViewFileInput(BaseModel): @@ -1087,4 +1087,4 @@ def _run( ) -> str: result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase) - return result.render() \ No newline at end of file + return result.render() diff --git a/src/codegen/extensions/langchain/web_tools.py b/src/codegen/extensions/langchain/web_tools.py index 1cd54681c..0bf259d22 100644 --- a/src/codegen/extensions/langchain/web_tools.py +++ b/src/codegen/extensions/langchain/web_tools.py @@ -1,16 +1,16 @@ """LangChain tools for web browsing.""" -from typing import ClassVar, Optional +from typing import ClassVar from langchain_core.tools.base import BaseTool from pydantic import BaseModel, Field -from codegen.extensions.web.web_client import WebClient from codegen.extensions.web.web import ( web_browse_page_tool, - web_search_tool, web_extract_images_tool, + web_search_tool, ) +from codegen.extensions.web.web_client import WebClient class WebBrowsePageInput(BaseModel): @@ -78,4 +78,4 @@ def __init__(self, client: WebClient) -> None: def _run(self, url: str, max_images: int = 20) -> str: result = web_extract_images_tool(self.client, url, max_images) - return result.render() \ No newline at end of file + return result.render() diff --git a/src/codegen/extensions/web/__init__.py b/src/codegen/extensions/web/__init__.py index ec4b398df..8e1d9363b 100644 --- a/src/codegen/extensions/web/__init__.py +++ b/src/codegen/extensions/web/__init__.py @@ -1,15 +1,15 @@ """Web browsing functionality.""" -from codegen.extensions.web.web_client import WebClient from codegen.extensions.web.web import ( web_browse_page_tool, - web_search_tool, web_extract_images_tool, + web_search_tool, ) +from codegen.extensions.web.web_client import WebClient __all__ = [ "WebClient", "web_browse_page_tool", - "web_search_tool", "web_extract_images_tool", -] \ No newline at end of file + "web_search_tool", +] diff --git a/src/codegen/extensions/web/types.py b/src/codegen/extensions/web/types.py index f1707ee37..4bbb5b785 100644 --- a/src/codegen/extensions/web/types.py +++ b/src/codegen/extensions/web/types.py @@ -5,7 +5,7 @@ class WebPage(BaseModel): """Represents a web page.""" - + url: str = Field(description="URL of the web page") title: str = Field(description="Title of the web page") content: str = Field(description="Main content of the web page") @@ -14,7 +14,7 @@ class WebPage(BaseModel): class WebSearchResult(BaseModel): """Represents a single search result.""" - + title: str = Field(description="Title of the search result") url: str = Field(description="URL of the search result") snippet: str = Field(description="Snippet or description of the search result") @@ -22,8 +22,8 @@ class WebSearchResult(BaseModel): class WebImage(BaseModel): """Represents an image from a web page.""" - + url: str = Field(description="URL of the image") alt_text: str | None = Field(None, description="Alternative text for the image") width: int | None = Field(None, description="Width of the image in pixels") - height: int | None = Field(None, description="Height of the image in pixels") \ No newline at end of file + height: int | None = Field(None, description="Height of the image in pixels") diff --git a/src/codegen/extensions/web/web.py b/src/codegen/extensions/web/web.py index 38ffa750e..0120f7f49 100644 --- a/src/codegen/extensions/web/web.py +++ b/src/codegen/extensions/web/web.py @@ -1,12 +1,12 @@ """Tools for web browsing functionality.""" -from typing import ClassVar, List +from typing import ClassVar import requests from pydantic import Field -from codegen.extensions.web.web_client import WebClient from codegen.extensions.tools.observation import Observation +from codegen.extensions.web.web_client import WebClient class WebBrowsePageObservation(Observation): @@ -24,7 +24,7 @@ class WebSearchObservation(Observation): """Response from web search.""" query: str = Field(description="Search query used") - results: List[dict] = Field(description="List of search results") + results: list[dict] = Field(description="List of search results") str_template: ClassVar[str] = "Found {result_count} results for '{query}'" @@ -40,7 +40,7 @@ class WebExtractImagesObservation(Observation): """Response from extracting images from a web page.""" url: str = Field(description="URL of the web page") - images: List[dict] = Field(description="List of extracted images") + images: list[dict] = Field(description="List of extracted images") str_template: ClassVar[str] = "Extracted {image_count} images from {url}" @@ -54,11 +54,11 @@ def _get_details(self) -> dict[str, str | int]: def web_browse_page_tool(client: WebClient, url: str) -> WebBrowsePageObservation: """Browse a web page and extract its content. - + Args: client: WebClient instance url: URL of the web page to browse - + Returns: WebBrowsePageObservation with the page content """ @@ -105,12 +105,12 @@ def web_browse_page_tool(client: WebClient, url: str) -> WebBrowsePageObservatio def web_search_tool(client: WebClient, query: str, num_results: int = 10) -> WebSearchObservation: """Search the web using a search engine. - + Args: client: WebClient instance query: Search query string num_results: Maximum number of results to return - + Returns: WebSearchObservation with search results """ @@ -149,12 +149,12 @@ def web_search_tool(client: WebClient, query: str, num_results: int = 10) -> Web def web_extract_images_tool(client: WebClient, url: str, max_images: int = 20) -> WebExtractImagesObservation: """Extract images from a web page. - + Args: client: WebClient instance url: URL of the web page max_images: Maximum number of images to extract - + Returns: WebExtractImagesObservation with extracted images """ @@ -188,4 +188,4 @@ def web_extract_images_tool(client: WebClient, url: str, max_images: int = 20) - error=f"Failed to extract images: {e!s}", url=url, images=[], - ) \ No newline at end of file + ) diff --git a/src/codegen/extensions/web/web_client.py b/src/codegen/extensions/web/web_client.py index 58c94e955..2be61c580 100644 --- a/src/codegen/extensions/web/web_client.py +++ b/src/codegen/extensions/web/web_client.py @@ -1,12 +1,13 @@ """Client for web browsing functionality.""" import os -from typing import List, Optional +from typing import Optional +from urllib.parse import urljoin, urlparse + import requests from bs4 import BeautifulSoup -from urllib.parse import urlparse, urljoin -from codegen.extensions.web.types import WebPage, WebSearchResult, WebImage +from codegen.extensions.web.types import WebImage, WebPage, WebSearchResult from codegen.shared.logging.get_logger import get_logger logger = get_logger(__name__) @@ -17,22 +18,22 @@ class WebClient: def __init__(self, user_agent: Optional[str] = None, max_retries: int = 3, timeout: int = 10): """Initialize the web client. - + Args: user_agent: Custom user agent string. If None, a default one will be used. max_retries: Maximum number of retries for failed requests. timeout: Timeout in seconds for requests. """ self.timeout = timeout - + # Set up a session with retry logic self.session = requests.Session() - + # Configure retries adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) - self.session.mount('http://', adapter) - self.session.mount('https://', adapter) - + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + # Set user agent if not user_agent: user_agent = os.getenv("WEB_USER_AGENT", "Codegen Web Browser Tool/1.0") @@ -40,13 +41,13 @@ def __init__(self, user_agent: Optional[str] = None, max_retries: int = 3, timeo def browse_page(self, url: str) -> WebPage: """Browse a web page and extract its content. - + Args: url: URL of the web page to browse. - + Returns: WebPage object containing the page content. - + Raises: ValueError: If the URL is invalid or the page cannot be accessed. """ @@ -54,57 +55,55 @@ def browse_page(self, url: str) -> WebPage: # Validate URL parsed_url = urlparse(url) if not parsed_url.scheme or not parsed_url.netloc: - raise ValueError(f"Invalid URL: {url}") - + msg = f"Invalid URL: {url}" + raise ValueError(msg) + # Make the request response = self.session.get(url, timeout=self.timeout) response.raise_for_status() - + # Parse the HTML - soup = BeautifulSoup(response.text, 'html.parser') - + soup = BeautifulSoup(response.text, "html.parser") + # Extract title title = soup.title.string if soup.title else "No title" - + # Extract main content (simplified approach) # Remove script and style elements for script in soup(["script", "style"]): script.extract() - + # Get text content - content = soup.get_text(separator='\n', strip=True) - + content = soup.get_text(separator="\n", strip=True) + # Truncate content if too long (100k chars max) if len(content) > 100000: content = content[:100000] + "... [content truncated]" - - return WebPage( - url=url, - title=title, - content=content, - status_code=response.status_code - ) - + + return WebPage(url=url, title=title, content=content, status_code=response.status_code) + except requests.exceptions.RequestException as e: - logger.error(f"Error browsing page {url}: {str(e)}") - raise ValueError(f"Failed to access URL: {str(e)}") + logger.exception(f"Error browsing page {url}: {e!s}") + msg = f"Failed to access URL: {e!s}" + raise ValueError(msg) except Exception as e: - logger.error(f"Error processing page {url}: {str(e)}") - raise ValueError(f"Error processing page: {str(e)}") + logger.exception(f"Error processing page {url}: {e!s}") + msg = f"Error processing page: {e!s}" + raise ValueError(msg) - def search(self, query: str, num_results: int = 10) -> List[WebSearchResult]: + def search(self, query: str, num_results: int = 10) -> list[WebSearchResult]: """Search the web using a search engine API. - + Note: This is a placeholder. In a real implementation, you would integrate with a search engine API like Google Custom Search, Bing, or DuckDuckGo. - + Args: query: Search query string. num_results: Maximum number of results to return. - + Returns: List of WebSearchResult objects. - + Raises: ValueError: If the search fails. """ @@ -112,28 +111,26 @@ def search(self, query: str, num_results: int = 10) -> List[WebSearchResult]: # 1. Call a search engine API # 2. Parse the results # 3. Return them as WebSearchResult objects - + # For now, return a message explaining this is a placeholder placeholder = WebSearchResult( title="Search Functionality Placeholder", url="https://example.com/search", - snippet="This is a placeholder for search functionality. In a real implementation, " - "this would integrate with a search engine API like Google Custom Search, " - "Bing, or DuckDuckGo." + snippet="This is a placeholder for search functionality. In a real implementation, this would integrate with a search engine API like Google Custom Search, Bing, or DuckDuckGo.", ) - + return [placeholder] - def extract_images(self, url: str, max_images: int = 20) -> List[WebImage]: + def extract_images(self, url: str, max_images: int = 20) -> list[WebImage]: """Extract images from a web page. - + Args: url: URL of the web page. max_images: Maximum number of images to extract. - + Returns: List of WebImage objects. - + Raises: ValueError: If the URL is invalid or the page cannot be accessed. """ @@ -141,39 +138,36 @@ def extract_images(self, url: str, max_images: int = 20) -> List[WebImage]: # Make the request response = self.session.get(url, timeout=self.timeout) response.raise_for_status() - + # Parse the HTML - soup = BeautifulSoup(response.text, 'html.parser') - + soup = BeautifulSoup(response.text, "html.parser") + # Find all image tags - img_tags = soup.find_all('img', limit=max_images) - + img_tags = soup.find_all("img", limit=max_images) + images = [] for img in img_tags: # Get image URL (handle relative URLs) - img_url = img.get('src', '') + img_url = img.get("src", "") if img_url: img_url = urljoin(url, img_url) else: continue - + # Get alt text, width, and height - alt_text = img.get('alt', None) - width = int(img.get('width', 0)) or None - height = int(img.get('height', 0)) or None - - images.append(WebImage( - url=img_url, - alt_text=alt_text, - width=width, - height=height - )) - + alt_text = img.get("alt", None) + width = int(img.get("width", 0)) or None + height = int(img.get("height", 0)) or None + + images.append(WebImage(url=img_url, alt_text=alt_text, width=width, height=height)) + return images - + except requests.exceptions.RequestException as e: - logger.error(f"Error accessing page {url}: {str(e)}") - raise ValueError(f"Failed to access URL: {str(e)}") + logger.exception(f"Error accessing page {url}: {e!s}") + msg = f"Failed to access URL: {e!s}" + raise ValueError(msg) except Exception as e: - logger.error(f"Error extracting images from {url}: {str(e)}") - raise ValueError(f"Error extracting images: {str(e)}") \ No newline at end of file + logger.exception(f"Error extracting images from {url}: {e!s}") + msg = f"Error extracting images: {e!s}" + raise ValueError(msg)