From 886f35739e8ca048007a3b2d7d7f309f59987cdd Mon Sep 17 00:00:00 2001
From: codegen-bot <team+codegenbot@codegen.sh>
Date: Fri, 14 Mar 2025 21:49:17 +0000
Subject: [PATCH 1/2] Add web browser tool for accessing web content

---
 src/codegen/extensions/langchain/agent.py   |   4 +-
 src/codegen/extensions/langchain/tools.py   |  54 +++++++-
 src/codegen/extensions/tools/__init__.py    |   5 +-
 src/codegen/extensions/tools/web_browser.py | 145 ++++++++++++++++++++
 4 files changed, 204 insertions(+), 4 deletions(-)
 create mode 100644 src/codegen/extensions/tools/web_browser.py

diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py
index 8917daa7f..7e8c7a838 100644
--- a/src/codegen/extensions/langchain/agent.py
+++ b/src/codegen/extensions/langchain/agent.py
@@ -23,6 +23,7 @@
     SearchTool,
     # SemanticEditTool,
     ViewFileTool,
+    WebBrowserTool,
 )
 
 from .graph import create_react_agent
@@ -134,6 +135,7 @@ def create_chat_agent(
         MoveSymbolTool(codebase),
         RevealSymbolTool(codebase),
         RelaceEditTool(codebase),
+        WebBrowserTool(codebase),
     ]
 
     if additional_tools:
@@ -214,4 +216,4 @@ def create_agent_with_tools(
 
     memory = MemorySaver() if memory else None
 
-    return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config)
+    return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config)
\ No newline at end of file
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 877b59f05..5ea63ef1e 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -25,6 +25,7 @@
 from codegen.extensions.tools.search import search
 from codegen.extensions.tools.semantic_edit import semantic_edit
 from codegen.extensions.tools.semantic_search import semantic_search
+from codegen.extensions.tools.web_browser import browse_web
 from codegen.sdk.core.codebase import Codebase
 
 from ..tools import (
@@ -850,11 +851,10 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:
         RevealSymbolTool(codebase),
         RunBashCommandTool(),  # Note: This tool doesn't need the codebase
         SearchTool(codebase),
-        # SemanticEditTool(codebase),
-        # SemanticSearchTool(codebase),
         ViewFileTool(codebase),
         RelaceEditTool(codebase),
         ReflectionTool(codebase),
+        WebBrowserTool(codebase),
         # Github
         GithubCreatePRTool(codebase),
         GithubCreatePRCommentTool(codebase),
@@ -1024,3 +1024,53 @@ def _run(
         result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase)
 
         return result.render()
+
+
+class WebBrowserInput(BaseModel):
+    """Input for web browser tool."""
+
+    url: str = Field(..., description="URL to browse (must include http:// or https://)")
+    extract_text_only: bool = Field(
+        default=True, 
+        description="Whether to extract only text content (True) or include HTML (False)"
+    )
+    timeout: int = Field(
+        default=10,
+        description="Request timeout in seconds"
+    )
+    max_content_length: int = Field(
+        default=10000,
+        description="Maximum content length to return"
+    )
+
+
+class WebBrowserTool(BaseTool):
+    """Agent-facing tool that fetches a URL with browse_web() and returns the rendered observation text."""
+
+    name: ClassVar[str] = "browse_web"
+    description: ClassVar[str] = """
+    Browse a website and extract its content.
+    This tool allows you to access web pages, retrieve information, and analyze online content.
+    Useful for researching documentation, checking references, or gathering information from websites.
+    """
+    args_schema: ClassVar[type[BaseModel]] = WebBrowserInput
+    codebase: Codebase = Field(exclude=True)
+
+    def __init__(self, codebase: Codebase) -> None:
+        super().__init__(codebase=codebase)
+
+    def _run(
+        self,
+        url: str,
+        extract_text_only: bool = True,
+        timeout: int = 10,
+        max_content_length: int = 10000,
+    ) -> str:  # parameter defaults mirror the WebBrowserInput schema
+        result = browse_web(
+            self.codebase,  # forwarded for interface parity; browse_web documents it as unused
+            url=url,
+            extract_text_only=extract_text_only,
+            timeout=timeout,
+            max_content_length=max_content_length,
+        )
+        return result.render()
\ No newline at end of file
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
index 8f49b68a8..9d007ce09 100644
--- a/src/codegen/extensions/tools/__init__.py
+++ b/src/codegen/extensions/tools/__init__.py
@@ -25,6 +25,7 @@
 from .semantic_edit import semantic_edit
 from .semantic_search import semantic_search
 from .view_file import view_file
+from .web_browser import browse_web
 
 __all__ = [
     # Git operations
@@ -57,4 +58,6 @@
     "semantic_search",
     "view_file",
     "view_pr",
-]
+    # Web operations
+    "browse_web",
+]
\ No newline at end of file
diff --git a/src/codegen/extensions/tools/web_browser.py b/src/codegen/extensions/tools/web_browser.py
new file mode 100644
index 000000000..18c1debbd
--- /dev/null
+++ b/src/codegen/extensions/tools/web_browser.py
@@ -0,0 +1,145 @@
+"""Web browser tool for accessing web content.
+
+This tool allows fetching web pages and extracting their content,
+enabling Codegen to browse websites and retrieve information.
+"""
+
+import re
+import requests
+from typing import ClassVar, Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+from pydantic import Field
+
+from codegen.sdk.core.codebase import Codebase
+
+from .observation import Observation
+
+
+class WebBrowserObservation(Observation):
+    """Response from browsing a web page."""
+
+    url: str = Field(
+        description="The URL that was accessed",
+    )
+    title: str = Field(
+        description="The title of the web page",
+    )
+    content: str = Field(
+        description="The extracted content from the web page",
+    )
+    status_code: int = Field(
+        description="HTTP status code of the response",
+    )
+
+    str_template: ClassVar[str] = "Browsed {url} (Status: {status_code})"
+
+    def render(self) -> str:
+        """Render the page (or the error message) as plain text for the agent."""
+        if self.status == "error":
+            return f"[WEB BROWSER ERROR]: {self.error}"
+        # `content` was already capped by browse_web(max_content_length=...); a second, fixed cut here would override it.
+        lines = [
+            f"[WEB PAGE]: {self.url}",
+            f"Status: {self.status_code}",
+            f"Title: {self.title}",
+            "",
+            "Content:",
+            "----------",
+            self.content,
+            "----------",
+        ]
+        return "\n".join(lines)
+
+
+def browse_web(
+    codebase: Codebase,
+    url: str,
+    extract_text_only: bool = True,
+    timeout: int = 10,
+    max_content_length: int = 10000,
+) -> WebBrowserObservation:
+    """Fetch a web page over HTTP(S) and return its title and content.
+
+    Args:
+        codebase: The codebase to operate on (not used but required for tool interface)
+        url: Absolute http:// or https:// URL to fetch; anything else yields an error observation
+        extract_text_only: Return only the visible text (True) or the parsed HTML (False)
+        timeout: Request timeout in seconds (default: 10)
+        max_content_length: Maximum content length to return (default: 10000)
+
+    Returns:
+        WebBrowserObservation with status "success", or "error" for invalid URLs and failed requests
+    """
+    # Validate URL up front: only absolute http(s) URLs are ever requested.
+    parsed_url = urlparse(url)
+    if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
+        return WebBrowserObservation(
+            status="error",
+            error=f"Invalid URL: {url}. Must include scheme (http:// or https://) and domain.",
+            url=url,
+            title="",
+            content="",
+            status_code=0,
+        )
+
+    # A negative limit would make the slice below drop characters from the end
+    # instead of capping the length, so clamp it to zero.
+    max_content_length = max(max_content_length, 0)
+
+    try:
+        # Set user agent to avoid being blocked
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        
+        # Make the request
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+        
+        # Parse the HTML
+        soup = BeautifulSoup(response.text, "html.parser")
+        
+        # Extract title
+        title = soup.title.string if soup.title else "No title found"
+        
+        # Extract content based on preference
+        if extract_text_only:
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.extract()
+                
+            # Get text and clean it up
+            text = soup.get_text()
+            # Break into lines and remove leading/trailing space
+            lines = (line.strip() for line in text.splitlines())
+            # Break multi-headlines into a line each
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            # Remove blank lines
+            content = "\n".join(chunk for chunk in chunks if chunk)
+        else:
+            # Return simplified HTML
+            content = str(soup)
+        
+        # Truncate content if too long
+        if len(content) > max_content_length:
+            content = content[:max_content_length] + "... (content truncated)"
+        
+        return WebBrowserObservation(
+            status="success",
+            url=url,
+            title=str(title) if title else "No title found",  # .string is None for an empty or nested <title>
+            content=content,
+            status_code=response.status_code,
+        )
+        
+    except requests.exceptions.RequestException as e:
+        return WebBrowserObservation(
+            status="error",
+            error=f"Error accessing URL: {str(e)}",
+            url=url,
+            title="",
+            content="",
+            status_code=getattr(e.response, "status_code", 0) if hasattr(e, "response") else 0,
+        )
\ No newline at end of file

From 3fa9e1f6c76cb26dc6f7a624585bd11863987a25 Mon Sep 17 00:00:00 2001
From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com>
Date: Fri, 14 Mar 2025 21:50:08 +0000
Subject: [PATCH 2/2] Automated pre-commit update

---
 src/codegen/extensions/langchain/agent.py   |  2 +-
 src/codegen/extensions/langchain/tools.py   | 17 +++---------
 src/codegen/extensions/tools/__init__.py    |  6 ++---
 src/codegen/extensions/tools/web_browser.py | 29 +++++++++------------
 4 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py
index 7e8c7a838..8aad4d0ef 100644
--- a/src/codegen/extensions/langchain/agent.py
+++ b/src/codegen/extensions/langchain/agent.py
@@ -216,4 +216,4 @@ def create_agent_with_tools(
 
     memory = MemorySaver() if memory else None
 
-    return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config)
\ No newline at end of file
+    return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config)
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 5ea63ef1e..47fa832fd 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -1030,18 +1030,9 @@ class WebBrowserInput(BaseModel):
     """Input for web browser tool."""
 
     url: str = Field(..., description="URL to browse (must include http:// or https://)")
-    extract_text_only: bool = Field(
-        default=True, 
-        description="Whether to extract only text content (True) or include HTML (False)"
-    )
-    timeout: int = Field(
-        default=10,
-        description="Request timeout in seconds"
-    )
-    max_content_length: int = Field(
-        default=10000,
-        description="Maximum content length to return"
-    )
+    extract_text_only: bool = Field(default=True, description="Whether to extract only text content (True) or include HTML (False)")
+    timeout: int = Field(default=10, description="Request timeout in seconds")
+    max_content_length: int = Field(default=10000, description="Maximum content length to return")
 
 
 class WebBrowserTool(BaseTool):
@@ -1073,4 +1064,4 @@ def _run(
             timeout=timeout,
             max_content_length=max_content_length,
         )
-        return result.render()
\ No newline at end of file
+        return result.render()
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
index 9d007ce09..57d328d14 100644
--- a/src/codegen/extensions/tools/__init__.py
+++ b/src/codegen/extensions/tools/__init__.py
@@ -28,6 +28,8 @@
 from .web_browser import browse_web
 
 __all__ = [
+    # Web operations
+    "browse_web",
     # Git operations
     "commit",
     # File operations
@@ -58,6 +60,4 @@
     "semantic_search",
     "view_file",
     "view_pr",
-    # Web operations
-    "browse_web",
-]
\ No newline at end of file
+]
diff --git a/src/codegen/extensions/tools/web_browser.py b/src/codegen/extensions/tools/web_browser.py
index 18c1debbd..570049a6e 100644
--- a/src/codegen/extensions/tools/web_browser.py
+++ b/src/codegen/extensions/tools/web_browser.py
@@ -4,11 +4,10 @@
 enabling Codegen to browse websites and retrieve information.
 """
 
-import re
-import requests
-from typing import ClassVar, Optional
+from typing import ClassVar
 from urllib.parse import urlparse
 
+import requests
 from bs4 import BeautifulSoup
 from pydantic import Field
 
@@ -90,26 +89,24 @@ def browse_web(
 
     try:
         # Set user agent to avoid being blocked
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
-        
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
+
         # Make the request
         response = requests.get(url, headers=headers, timeout=timeout)
         response.raise_for_status()
-        
+
         # Parse the HTML
         soup = BeautifulSoup(response.text, "html.parser")
-        
+
         # Extract title
         title = soup.title.string if soup.title else "No title found"
-        
+
         # Extract content based on preference
         if extract_text_only:
             # Remove script and style elements
             for script in soup(["script", "style"]):
                 script.extract()
-                
+
             # Get text and clean it up
             text = soup.get_text()
             # Break into lines and remove leading/trailing space
@@ -121,11 +118,11 @@ def browse_web(
         else:
             # Return simplified HTML
             content = str(soup)
-        
+
         # Truncate content if too long
         if len(content) > max_content_length:
             content = content[:max_content_length] + "... (content truncated)"
-        
+
         return WebBrowserObservation(
             status="success",
             url=url,
@@ -133,13 +130,13 @@ def browse_web(
             content=content,
             status_code=response.status_code,
         )
-        
+
     except requests.exceptions.RequestException as e:
         return WebBrowserObservation(
             status="error",
-            error=f"Error accessing URL: {str(e)}",
+            error=f"Error accessing URL: {e!s}",
             url=url,
             title="",
             content="",
             status_code=getattr(e.response, "status_code", 0) if hasattr(e, "response") else 0,
-        )
\ No newline at end of file
+        )