Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/codegen/extensions/langchain/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
SearchTool,
# SemanticEditTool,
ViewFileTool,
WebBrowserTool,
)

from .graph import create_react_agent
Expand Down Expand Up @@ -134,6 +135,7 @@ def create_chat_agent(
MoveSymbolTool(codebase),
RevealSymbolTool(codebase),
RelaceEditTool(codebase),
WebBrowserTool(codebase),
]

if additional_tools:
Expand Down
45 changes: 43 additions & 2 deletions src/codegen/extensions/langchain/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from codegen.extensions.tools.search import search
from codegen.extensions.tools.semantic_edit import semantic_edit
from codegen.extensions.tools.semantic_search import semantic_search
from codegen.extensions.tools.web_browser import browse_web
from codegen.sdk.core.codebase import Codebase

from ..tools import (
Expand Down Expand Up @@ -850,11 +851,10 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:
RevealSymbolTool(codebase),
RunBashCommandTool(), # Note: This tool doesn't need the codebase
SearchTool(codebase),
# SemanticEditTool(codebase),
# SemanticSearchTool(codebase),
ViewFileTool(codebase),
RelaceEditTool(codebase),
ReflectionTool(codebase),
WebBrowserTool(codebase),
# Github
GithubCreatePRTool(codebase),
GithubCreatePRCommentTool(codebase),
Expand Down Expand Up @@ -1024,3 +1024,44 @@ def _run(
result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase)

return result.render()


class WebBrowserInput(BaseModel):
    """Schema for the arguments accepted by `WebBrowserTool` / `browse_web`."""

    # Target page; browse_web rejects URLs without a scheme and domain.
    url: str = Field(..., description="URL to browse (must include http:// or https://)")
    # True strips markup/scripts and returns plain text; False returns the parsed HTML.
    extract_text_only: bool = Field(default=True, description="Whether to extract only text content (True) or include HTML (False)")
    # Passed straight through to the HTTP request.
    timeout: int = Field(default=10, description="Request timeout in seconds")
    # Content beyond this length is truncated with a marker before being returned.
    max_content_length: int = Field(default=10000, description="Maximum content length to return")

class WebBrowserTool(BaseTool):
    """Langchain tool wrapper around `browse_web` for fetching web-page content."""

    name: ClassVar[str] = "browse_web"
    description: ClassVar[str] = """
    Browse a website and extract its content.
    This tool allows you to access web pages, retrieve information, and analyze online content.
    Useful for researching documentation, checking references, or gathering information from websites.
    """
    args_schema: ClassVar[type[BaseModel]] = WebBrowserInput
    # Excluded from serialization; only held so _run can forward it to browse_web.
    codebase: Codebase = Field(exclude=True)

    def __init__(self, codebase: Codebase) -> None:
        super().__init__(codebase=codebase)

    def _run(
        self,
        url: str,
        extract_text_only: bool = True,
        timeout: int = 10,
        max_content_length: int = 10000,
    ) -> str:
        # Delegate to the shared implementation, then render the observation
        # into the plain-text form langchain expects from a tool.
        observation = browse_web(
            self.codebase,
            url=url,
            extract_text_only=extract_text_only,
            timeout=timeout,
            max_content_length=max_content_length,
        )
        return observation.render()
3 changes: 3 additions & 0 deletions src/codegen/extensions/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@
from .semantic_edit import semantic_edit
from .semantic_search import semantic_search
from .view_file import view_file
from .web_browser import browse_web

__all__ = [
# Web operations
"browse_web",
# Git operations
"commit",
# File operations
Expand Down
142 changes: 142 additions & 0 deletions src/codegen/extensions/tools/web_browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Web browser tool for accessing web content.

This tool allows fetching web pages and extracting their content,
enabling Codegen to browse websites and retrieve information.
"""

from typing import ClassVar
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from pydantic import Field

from codegen.sdk.core.codebase import Codebase

from .observation import Observation


class WebBrowserObservation(Observation):
    """Observation describing the outcome of fetching one web page."""

    url: str = Field(
        description="The URL that was accessed",
    )
    title: str = Field(
        description="The title of the web page",
    )
    content: str = Field(
        description="The extracted content from the web page",
    )
    status_code: int = Field(
        description="HTTP status code of the response",
    )

    str_template: ClassVar[str] = "Browsed {url} (Status: {status_code})"

    def render(self) -> str:
        """Render web browser results in a readable format."""
        # Error observations carry their message in self.error (from Observation).
        if self.status == "error":
            return f"[WEB BROWSER ERROR]: {self.error}"

        # Cap the displayed body at 2000 chars so the rendered output stays short.
        body = self.content[:2000]
        if len(self.content) > 2000:
            body += "..."

        return "\n".join(
            [
                f"[WEB PAGE]: {self.url}",
                f"Status: {self.status_code}",
                f"Title: {self.title}",
                "",
                "Content:",
                "----------",
                body,
                "----------",
            ]
        )


def browse_web(
    codebase: Codebase,
    url: str,
    extract_text_only: bool = True,
    timeout: int = 10,
    max_content_length: int = 10000,
) -> WebBrowserObservation:
    """Browse a web page and extract its content.

    Args:
        codebase: The codebase to operate on (not used but required for tool interface)
        url: The URL to browse; "https://" is assumed when no scheme is given
        extract_text_only: Whether to extract only text content (default: True)
        timeout: Request timeout in seconds (default: 10)
        max_content_length: Maximum content length to return (default: 10000)

    Returns:
        WebBrowserObservation containing the web page content and metadata
    """
    # Default to https:// for scheme-less inputs like "example.com".
    # (Previously this normalization sat *after* the validation below, making it
    # unreachable: scheme-less URLs were rejected before it could run.)
    if not urlparse(url).scheme:
        url = f"https://{url}"

    # Validate the (possibly normalized) URL: needs both scheme and domain.
    parsed_url = urlparse(url)
    if not parsed_url.scheme or not parsed_url.netloc:
        return WebBrowserObservation(
            status="error",
            error=f"Invalid URL: {url}. Must include scheme (http:// or https://) and domain.",
            url=url,
            title="",
            content="",
            status_code=0,
        )

    try:
        # Browser-like user agent to avoid being blocked by naive bot filters.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

        # Make the request; non-2xx statuses raise and are reported as errors below.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # <title> may be missing, or present but empty / non-string-valued, in
        # which case soup.title.string is None — never let None reach the str field.
        title = soup.title.string if soup.title and soup.title.string else "No title found"

        if extract_text_only:
            # Remove script and style elements so only visible text remains.
            for script in soup(["script", "style"]):
                script.extract()

            # Get text and clean it up.
            text = soup.get_text()
            # Break into lines and remove leading/trailing space.
            lines = (line.strip() for line in text.splitlines())
            # Break multi-headlines into a line each: split on *two* spaces, the
            # gap get_text leaves between adjacent inline elements. (A single-space
            # split would wrongly put every word on its own line.)
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # Remove blank lines.
            content = "\n".join(chunk for chunk in chunks if chunk)
        else:
            # Return the parsed (normalized) HTML.
            content = str(soup)

        # Truncate content if too long.
        if len(content) > max_content_length:
            content = content[:max_content_length] + "... (content truncated)"

        return WebBrowserObservation(
            status="success",
            url=url,
            title=title,
            content=content,
            status_code=response.status_code,
        )

    except requests.exceptions.RequestException as e:
        # e.response is None for connection-level failures (DNS, refused, timeout);
        # it only carries a status code when an HTTP response actually came back.
        err_response = getattr(e, "response", None)
        return WebBrowserObservation(
            status="error",
            error=f"Error accessing URL: {e!s}",
            url=url,
            title="",
            content="",
            status_code=err_response.status_code if err_response is not None else 0,
        )
Loading