diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 877b59f05..64a993fb1 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -25,6 +25,7 @@ from codegen.extensions.tools.search import search from codegen.extensions.tools.semantic_edit import semantic_edit from codegen.extensions.tools.semantic_search import semantic_search +from codegen.extensions.web.web_client import WebClient from codegen.sdk.core.codebase import Codebase from ..tools import ( @@ -43,6 +44,7 @@ ) from ..tools.relace_edit_prompts import RELACE_EDIT_PROMPT from ..tools.semantic_edit_prompts import FILE_EDIT_PROMPT +from .web_tools import WebBrowsePageTool, WebExtractImagesTool, WebSearchTool class ViewFileInput(BaseModel): @@ -868,6 +870,10 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: LinearSearchIssuesTool(codebase), LinearCreateIssueTool(codebase), LinearGetTeamsTool(codebase), + # Web + WebBrowsePageTool(WebClient()), + WebSearchTool(WebClient()), + WebExtractImagesTool(WebClient()), ] diff --git a/src/codegen/extensions/langchain/web_tools.py b/src/codegen/extensions/langchain/web_tools.py new file mode 100644 index 000000000..0bf259d22 --- /dev/null +++ b/src/codegen/extensions/langchain/web_tools.py @@ -0,0 +1,81 @@ +"""LangChain tools for web browsing.""" + +from typing import ClassVar + +from langchain_core.tools.base import BaseTool +from pydantic import BaseModel, Field + +from codegen.extensions.web.web import ( + web_browse_page_tool, + web_extract_images_tool, + web_search_tool, +) +from codegen.extensions.web.web_client import WebClient + + +class WebBrowsePageInput(BaseModel): + """Input for 
browsing a web page.""" + + url: str = Field(..., description="URL of the web page to browse") + + +class WebBrowsePageTool(BaseTool): + """Tool for browsing web pages.""" + + name: ClassVar[str] = "web_browse_page" + description: ClassVar[str] = "Browse a web page and extract its content" + args_schema: ClassVar[type[BaseModel]] = WebBrowsePageInput + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, url: str) -> str: + result = web_browse_page_tool(self.client, url) + return result.render() + + +class WebSearchInput(BaseModel): + """Input for web search.""" + + query: str = Field(..., description="Search query string") + num_results: int = Field(default=10, description="Maximum number of results to return") + + +class WebSearchTool(BaseTool): + """Tool for searching the web.""" + + name: ClassVar[str] = "web_search" + description: ClassVar[str] = "Search the web using a search engine" + args_schema: ClassVar[type[BaseModel]] = WebSearchInput + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, query: str, num_results: int = 10) -> str: + result = web_search_tool(self.client, query, num_results) + return result.render() + + +class WebExtractImagesInput(BaseModel): + """Input for extracting images from a web page.""" + + url: str = Field(..., description="URL of the web page") + max_images: int = Field(default=20, description="Maximum number of images to extract") + + +class WebExtractImagesTool(BaseTool): + """Tool for extracting images from web pages.""" + + name: ClassVar[str] = "web_extract_images" + description: ClassVar[str] = "Extract images from a web page" + args_schema: ClassVar[type[BaseModel]] = WebExtractImagesInput + client: WebClient = Field(exclude=True) + + def __init__(self, client: WebClient) -> None: + super().__init__(client=client) + + def _run(self, url: str, 
max_images: int = 20) -> str: + result = web_extract_images_tool(self.client, url, max_images) + return result.render() diff --git a/src/codegen/extensions/web/__init__.py b/src/codegen/extensions/web/__init__.py new file mode 100644 index 000000000..8e1d9363b --- /dev/null +++ b/src/codegen/extensions/web/__init__.py @@ -0,0 +1,15 @@ +"""Web browsing functionality.""" + +from codegen.extensions.web.web import ( + web_browse_page_tool, + web_extract_images_tool, + web_search_tool, +) +from codegen.extensions.web.web_client import WebClient + +__all__ = [ + "WebClient", + "web_browse_page_tool", + "web_extract_images_tool", + "web_search_tool", +] diff --git a/src/codegen/extensions/web/types.py b/src/codegen/extensions/web/types.py new file mode 100644 index 000000000..4bbb5b785 --- /dev/null +++ b/src/codegen/extensions/web/types.py @@ -0,0 +1,29 @@ +"""Types for web browsing functionality.""" + +from pydantic import BaseModel, Field + + +class WebPage(BaseModel): + """Represents a web page.""" + + url: str = Field(description="URL of the web page") + title: str = Field(description="Title of the web page") + content: str = Field(description="Main content of the web page") + status_code: int = Field(description="HTTP status code of the response") + + +class WebSearchResult(BaseModel): + """Represents a single search result.""" + + title: str = Field(description="Title of the search result") + url: str = Field(description="URL of the search result") + snippet: str = Field(description="Snippet or description of the search result") + + +class WebImage(BaseModel): + """Represents an image from a web page.""" + + url: str = Field(description="URL of the image") + alt_text: str | None = Field(None, description="Alternative text for the image") + width: int | None = Field(None, description="Width of the image in pixels") + height: int | None = Field(None, description="Height of the image in pixels") diff --git a/src/codegen/extensions/web/web.py 
b/src/codegen/extensions/web/web.py new file mode 100644 index 000000000..0120f7f49 --- /dev/null +++ b/src/codegen/extensions/web/web.py @@ -0,0 +1,191 @@ +"""Tools for web browsing functionality.""" + +from typing import ClassVar + +import requests +from pydantic import Field + +from codegen.extensions.tools.observation import Observation +from codegen.extensions.web.web_client import WebClient + + +class WebBrowsePageObservation(Observation): + """Response from browsing a web page.""" + + url: str = Field(description="URL of the browsed page") + title: str = Field(description="Title of the web page") + content: str = Field(description="Content of the web page") + status_code: int = Field(description="HTTP status code") + + str_template: ClassVar[str] = "Browsed page: {title} ({url})" + + +class WebSearchObservation(Observation): + """Response from web search.""" + + query: str = Field(description="Search query used") + results: list[dict] = Field(description="List of search results") + + str_template: ClassVar[str] = "Found {result_count} results for '{query}'" + + def _get_details(self) -> dict[str, str | int]: + """Get details for string representation.""" + return { + "result_count": len(self.results), + "query": self.query, + } + + +class WebExtractImagesObservation(Observation): + """Response from extracting images from a web page.""" + + url: str = Field(description="URL of the web page") + images: list[dict] = Field(description="List of extracted images") + + str_template: ClassVar[str] = "Extracted {image_count} images from {url}" + + def _get_details(self) -> dict[str, str | int]: + """Get details for string representation.""" + return { + "image_count": len(self.images), + "url": self.url, + } + + +def web_browse_page_tool(client: WebClient, url: str) -> WebBrowsePageObservation: + """Browse a web page and extract its content. 
+ + Args: + client: WebClient instance + url: URL of the web page to browse + + Returns: + WebBrowsePageObservation with the page content + """ + try: + page = client.browse_page(url) + return WebBrowsePageObservation( + status="success", + url=page.url, + title=page.title, + content=page.content, + status_code=page.status_code, + ) + except requests.exceptions.RequestException as e: + # Network-related errors + return WebBrowsePageObservation( + status="error", + error=f"Network error when browsing page: {e!s}", + url=url, + title="Error", + content="", + status_code=0, + ) + except ValueError as e: + # Input validation errors + return WebBrowsePageObservation( + status="error", + error=f"Invalid input: {e!s}", + url=url, + title="Error", + content="", + status_code=0, + ) + except Exception as e: + # Catch-all for other errors + return WebBrowsePageObservation( + status="error", + error=f"Failed to browse page: {e!s}", + url=url, + title="Error", + content="", + status_code=0, + ) + + +def web_search_tool(client: WebClient, query: str, num_results: int = 10) -> WebSearchObservation: + """Search the web using a search engine. 
+ + Args: + client: WebClient instance + query: Search query string + num_results: Maximum number of results to return + + Returns: + WebSearchObservation with search results + """ + try: + results = client.search(query, num_results) + return WebSearchObservation( + status="success", + query=query, + results=[result.dict() for result in results], + ) + except requests.exceptions.RequestException as e: + # Network-related errors + return WebSearchObservation( + status="error", + error=f"Network error when searching: {e!s}", + query=query, + results=[], + ) + except ValueError as e: + # Input validation errors + return WebSearchObservation( + status="error", + error=f"Invalid input: {e!s}", + query=query, + results=[], + ) + except Exception as e: + # Catch-all for other errors + return WebSearchObservation( + status="error", + error=f"Failed to search: {e!s}", + query=query, + results=[], + ) + + +def web_extract_images_tool(client: WebClient, url: str, max_images: int = 20) -> WebExtractImagesObservation: + """Extract images from a web page. 
+ + Args: + client: WebClient instance + url: URL of the web page + max_images: Maximum number of images to extract + + Returns: + WebExtractImagesObservation with extracted images + """ + try: + images = client.extract_images(url, max_images) + return WebExtractImagesObservation( + status="success", + url=url, + images=[image.dict() for image in images], + ) + except requests.exceptions.RequestException as e: + # Network-related errors + return WebExtractImagesObservation( + status="error", + error=f"Network error when extracting images: {e!s}", + url=url, + images=[], + ) + except ValueError as e: + # Input validation errors + return WebExtractImagesObservation( + status="error", + error=f"Invalid input: {e!s}", + url=url, + images=[], + ) + except Exception as e: + # Catch-all for other errors + return WebExtractImagesObservation( + status="error", + error=f"Failed to extract images: {e!s}", + url=url, + images=[], + ) diff --git a/src/codegen/extensions/web/web_client.py b/src/codegen/extensions/web/web_client.py new file mode 100644 index 000000000..2be61c580 --- /dev/null +++ b/src/codegen/extensions/web/web_client.py @@ -0,0 +1,173 @@ +"""Client for web browsing functionality.""" + +import os +from typing import Optional +from urllib.parse import urljoin, urlparse + +import requests +from bs4 import BeautifulSoup + +from codegen.extensions.web.types import WebImage, WebPage, WebSearchResult +from codegen.shared.logging.get_logger import get_logger + +logger = get_logger(__name__) + + +class WebClient: + """Client for web browsing functionality.""" + + def __init__(self, user_agent: Optional[str] = None, max_retries: int = 3, timeout: int = 10): + """Initialize the web client. + + Args: + user_agent: Custom user agent string. If None, a default one will be used. + max_retries: Maximum number of retries for failed requests. + timeout: Timeout in seconds for requests. 
""" + self.timeout = timeout + + # Set up a session with retry logic + self.session = requests.Session() + + # Configure retries + adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + # Set user agent + if not user_agent: + user_agent = os.getenv("WEB_USER_AGENT", "Codegen Web Browser Tool/1.0") + self.session.headers.update({"User-Agent": user_agent}) + + def browse_page(self, url: str) -> WebPage: + """Browse a web page and extract its content. + + Args: + url: URL of the web page to browse. + + Returns: + WebPage object containing the page content. + + Raises: + ValueError: If the URL is invalid or the page cannot be accessed. + """ + try: + # Validate URL + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + msg = f"Invalid URL: {url}" + raise ValueError(msg) + + # Make the request + response = self.session.get(url, timeout=self.timeout) + response.raise_for_status() + + # Parse the HTML + soup = BeautifulSoup(response.text, "html.parser") + + # Extract title (Tag.string is None for empty/multi-child <title>) + title = (soup.title.string or "No title") if soup.title else "No title" + + # Extract main content (simplified approach) + # Remove script and style elements + for script in soup(["script", "style"]): + script.extract() + + # Get text content + content = soup.get_text(separator="\n", strip=True) + + # Truncate content if too long (100k chars max) + if len(content) > 100000: + content = content[:100000] + "... 
[content truncated]" + + return WebPage(url=url, title=title, content=content, status_code=response.status_code) + + except requests.exceptions.RequestException as e: + logger.exception(f"Error browsing page {url}: {e!s}") + msg = f"Failed to access URL: {e!s}" + raise ValueError(msg) + except Exception as e: + logger.exception(f"Error processing page {url}: {e!s}") + msg = f"Error processing page: {e!s}" + raise ValueError(msg) + + def search(self, query: str, num_results: int = 10) -> list[WebSearchResult]: + """Search the web using a search engine API. + + Note: This is a placeholder. In a real implementation, you would integrate + with a search engine API like Google Custom Search, Bing, or DuckDuckGo. + + Args: + query: Search query string. + num_results: Maximum number of results to return. + + Returns: + List of WebSearchResult objects. + + Raises: + ValueError: If the search fails. + """ + # This is a placeholder. In a real implementation, you would: + # 1. Call a search engine API + # 2. Parse the results + # 3. Return them as WebSearchResult objects + + # For now, return a message explaining this is a placeholder + placeholder = WebSearchResult( + title="Search Functionality Placeholder", + url="https://example.com/search", + snippet="This is a placeholder for search functionality. In a real implementation, this would integrate with a search engine API like Google Custom Search, Bing, or DuckDuckGo.", + ) + + return [placeholder] + + def extract_images(self, url: str, max_images: int = 20) -> list[WebImage]: + """Extract images from a web page. + + Args: + url: URL of the web page. + max_images: Maximum number of images to extract. + + Returns: + List of WebImage objects. + + Raises: + ValueError: If the URL is invalid or the page cannot be accessed. 
""" + try: + # Make the request + response = self.session.get(url, timeout=self.timeout) + response.raise_for_status() + + # Parse the HTML + soup = BeautifulSoup(response.text, "html.parser") + + # Find all image tags + img_tags = soup.find_all("img", limit=max_images) + + images = [] + for img in img_tags: + # Get image URL (handle relative URLs) + img_url = img.get("src", "") + if img_url: + img_url = urljoin(url, img_url) + else: + continue + + # Get alt text, width, and height (tolerate non-numeric attrs like "100%") + alt_text = img.get("alt", None) + width = (int(img["width"]) or None) if str(img.get("width", "")).isdigit() else None + height = (int(img["height"]) or None) if str(img.get("height", "")).isdigit() else None + + images.append(WebImage(url=img_url, alt_text=alt_text, width=width, height=height)) + + return images + + except requests.exceptions.RequestException as e: + logger.exception(f"Error accessing page {url}: {e!s}") + msg = f"Failed to access URL: {e!s}" + raise ValueError(msg) + except Exception as e: + logger.exception(f"Error extracting images from {url}: {e!s}") + msg = f"Error extracting images: {e!s}" + raise ValueError(msg)