From 886f35739e8ca048007a3b2d7d7f309f59987cdd Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Fri, 14 Mar 2025 21:49:17 +0000 Subject: [PATCH 1/2] Add web browser tool for accessing web content --- src/codegen/extensions/langchain/agent.py | 4 +- src/codegen/extensions/langchain/tools.py | 54 +++++++- src/codegen/extensions/tools/__init__.py | 5 +- src/codegen/extensions/tools/web_browser.py | 145 ++++++++++++++++++++ 4 files changed, 204 insertions(+), 4 deletions(-) create mode 100644 src/codegen/extensions/tools/web_browser.py diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py index 8917daa7f..7e8c7a838 100644 --- a/src/codegen/extensions/langchain/agent.py +++ b/src/codegen/extensions/langchain/agent.py @@ -23,6 +23,7 @@ SearchTool, # SemanticEditTool, ViewFileTool, + WebBrowserTool, ) from .graph import create_react_agent @@ -134,6 +135,7 @@ def create_chat_agent( MoveSymbolTool(codebase), RevealSymbolTool(codebase), RelaceEditTool(codebase), + WebBrowserTool(codebase), ] if additional_tools: @@ -214,4 +216,4 @@ def create_agent_with_tools( memory = MemorySaver() if memory else None - return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) + return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) \ No newline at end of file diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 877b59f05..5ea63ef1e 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -25,6 +25,7 @@ from codegen.extensions.tools.search import search from codegen.extensions.tools.semantic_edit import semantic_edit from codegen.extensions.tools.semantic_search import semantic_search +from codegen.extensions.tools.web_browser import browse_web from codegen.sdk.core.codebase import Codebase from ..tools import ( @@ -850,11 +851,10 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: RevealSymbolTool(codebase), RunBashCommandTool(), # Note: This tool doesn't need the codebase SearchTool(codebase), - # SemanticEditTool(codebase), - # SemanticSearchTool(codebase), ViewFileTool(codebase), RelaceEditTool(codebase), ReflectionTool(codebase), + WebBrowserTool(codebase), # Github GithubCreatePRTool(codebase), GithubCreatePRCommentTool(codebase), @@ -1024,3 +1024,53 @@ def _run( result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase) return result.render() + + +class WebBrowserInput(BaseModel): + """Input for web browser tool.""" + + url: str = Field(..., description="URL to browse (must include http:// or https://)") + extract_text_only: bool = Field( + default=True, + description="Whether to extract only text content (True) or include HTML (False)" + ) + timeout: int = Field( + default=10, + description="Request timeout in seconds" + ) + max_content_length: int = Field( + default=10000, + description="Maximum content length to return" + ) + + +class WebBrowserTool(BaseTool): + """Tool for browsing websites and extracting content.""" + + name: ClassVar[str] = "browse_web" + description: ClassVar[str] = """ + Browse a website and extract its content. + This tool allows you to access web pages, retrieve information, and analyze online content. + Useful for researching documentation, checking references, or gathering information from websites. + """ + args_schema: ClassVar[type[BaseModel]] = WebBrowserInput + codebase: Codebase = Field(exclude=True) + + def __init__(self, codebase: Codebase) -> None: + super().__init__(codebase=codebase) + + def _run( + self, + url: str, + extract_text_only: bool = True, + timeout: int = 10, + max_content_length: int = 10000, + ) -> str: + result = browse_web( + self.codebase, + url=url, + extract_text_only=extract_text_only, + timeout=timeout, + max_content_length=max_content_length, + ) + return result.render() \ No newline at end of file diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py index 8f49b68a8..9d007ce09 100644 --- a/src/codegen/extensions/tools/__init__.py +++ b/src/codegen/extensions/tools/__init__.py @@ -25,6 +25,7 @@ from .semantic_edit import semantic_edit from .semantic_search import semantic_search from .view_file import view_file +from .web_browser import browse_web __all__ = [ # Git operations @@ -57,4 +58,6 @@ "semantic_search", "view_file", "view_pr", -] + # Web operations + "browse_web", +] \ No newline at end of file diff --git a/src/codegen/extensions/tools/web_browser.py b/src/codegen/extensions/tools/web_browser.py new file mode 100644 index 000000000..18c1debbd --- /dev/null +++ b/src/codegen/extensions/tools/web_browser.py @@ -0,0 +1,145 @@ +"""Web browser tool for accessing web content. + +This tool allows fetching web pages and extracting their content, +enabling Codegen to browse websites and retrieve information. +""" + +import re +import requests +from typing import ClassVar, Optional +from urllib.parse import urlparse + +from bs4 import BeautifulSoup +from pydantic import Field + +from codegen.sdk.core.codebase import Codebase + +from .observation import Observation + + +class WebBrowserObservation(Observation): + """Response from browsing a web page.""" + + url: str = Field( + description="The URL that was accessed", + ) + title: str = Field( + description="The title of the web page", + ) + content: str = Field( + description="The extracted content from the web page", + ) + status_code: int = Field( + description="HTTP status code of the response", + ) + + str_template: ClassVar[str] = "Browsed {url} (Status: {status_code})" + + def render(self) -> str: + """Render web browser results in a readable format.""" + if self.status == "error": + return f"[WEB BROWSER ERROR]: {self.error}" + + lines = [ + f"[WEB PAGE]: {self.url}", + f"Status: {self.status_code}", + f"Title: {self.title}", + "", + "Content:", + "----------", + self.content[:2000] + ("..." if len(self.content) > 2000 else ""), + "----------", + ] + return "\n".join(lines) + + +def browse_web( + codebase: Codebase, + url: str, + extract_text_only: bool = True, + timeout: int = 10, + max_content_length: int = 10000, +) -> WebBrowserObservation: + """Browse a web page and extract its content. + + Args: + codebase: The codebase to operate on (not used but required for tool interface) + url: The URL to browse + extract_text_only: Whether to extract only text content (default: True) + timeout: Request timeout in seconds (default: 10) + max_content_length: Maximum content length to return (default: 10000) + + Returns: + WebBrowserObservation containing the web page content and metadata + """ + # Validate URL + parsed_url = urlparse(url) + if not parsed_url.scheme or not parsed_url.netloc: + return WebBrowserObservation( + status="error", + error=f"Invalid URL: {url}. Must include scheme (http:// or https://) and domain.", + url=url, + title="", + content="", + status_code=0, + ) + + # Add scheme if missing + if not parsed_url.scheme: + url = f"https://{url}" + + try: + # Set user agent to avoid being blocked + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + + # Make the request + response = requests.get(url, headers=headers, timeout=timeout) + response.raise_for_status() + + # Parse the HTML + soup = BeautifulSoup(response.text, "html.parser") + + # Extract title + title = soup.title.string if soup.title else "No title found" + + # Extract content based on preference + if extract_text_only: + # Remove script and style elements + for script in soup(["script", "style"]): + script.extract() + + # Get text and clean it up + text = soup.get_text() + # Break into lines and remove leading/trailing space + lines = (line.strip() for line in text.splitlines()) + # Break multi-headlines into a line each + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + # Remove blank lines + content = "\n".join(chunk for chunk in chunks if chunk) + else: + # Return simplified HTML + content = str(soup) + + # Truncate content if too long + if len(content) > max_content_length: + content = content[:max_content_length] + "... (content truncated)" + + return WebBrowserObservation( + status="success", + url=url, + title=title, + content=content, + status_code=response.status_code, + ) + + except requests.exceptions.RequestException as e: + return WebBrowserObservation( + status="error", + error=f"Error accessing URL: {str(e)}", + url=url, + title="", + content="", + status_code=getattr(e.response, "status_code", 0) if hasattr(e, "response") else 0, + ) \ No newline at end of file From 3fa9e1f6c76cb26dc6f7a624585bd11863987a25 Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Fri, 14 Mar 2025 21:50:08 +0000 Subject: [PATCH 2/2] Automated pre-commit update --- src/codegen/extensions/langchain/agent.py | 2 +- src/codegen/extensions/langchain/tools.py | 17 +++--------- src/codegen/extensions/tools/__init__.py | 6 ++--- src/codegen/extensions/tools/web_browser.py | 29 +++++++++------------ 4 files changed, 21 insertions(+), 33 deletions(-) diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py index 7e8c7a838..8aad4d0ef 100644 --- a/src/codegen/extensions/langchain/agent.py +++ b/src/codegen/extensions/langchain/agent.py @@ -216,4 +216,4 @@ def create_agent_with_tools( memory = MemorySaver() if memory else None - return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) \ No newline at end of file + return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 5ea63ef1e..47fa832fd 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -1030,18 +1030,9 @@ class WebBrowserInput(BaseModel): """Input for web browser tool.""" url: str = Field(..., description="URL to browse (must include http:// or https://)") - extract_text_only: bool = Field( - default=True, - description="Whether to extract only text content (True) or include HTML (False)" - ) - timeout: int = Field( - default=10, - description="Request timeout in seconds" - ) - max_content_length: int = Field( - default=10000, - description="Maximum content length to return" - ) + extract_text_only: bool = Field(default=True, description="Whether to extract only text content (True) or include HTML (False)") + timeout: int = Field(default=10, description="Request timeout in seconds") + max_content_length: int = Field(default=10000, description="Maximum content length to return") class WebBrowserTool(BaseTool): @@ -1073,4 +1064,4 @@ def _run( timeout=timeout, max_content_length=max_content_length, ) - return result.render() \ No newline at end of file + return result.render() diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py index 9d007ce09..57d328d14 100644 --- a/src/codegen/extensions/tools/__init__.py +++ b/src/codegen/extensions/tools/__init__.py @@ -28,6 +28,8 @@ from .web_browser import browse_web __all__ = [ + # Web operations + "browse_web", # Git operations "commit", # File operations @@ -58,6 +60,4 @@ "semantic_search", "view_file", "view_pr", - # Web operations - "browse_web", -] \ No newline at end of file +] diff --git a/src/codegen/extensions/tools/web_browser.py b/src/codegen/extensions/tools/web_browser.py index 18c1debbd..570049a6e 100644 --- a/src/codegen/extensions/tools/web_browser.py +++ b/src/codegen/extensions/tools/web_browser.py @@ -4,11 +4,10 @@ enabling Codegen to browse websites and retrieve information. """ -import re -import requests -from typing import ClassVar, Optional +from typing import ClassVar from urllib.parse import urlparse +import requests from bs4 import BeautifulSoup from pydantic import Field @@ -90,26 +89,24 @@ def browse_web( try: # Set user agent to avoid being blocked - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"} + # Make the request response = requests.get(url, headers=headers, timeout=timeout) response.raise_for_status() - + # Parse the HTML soup = BeautifulSoup(response.text, "html.parser") - + # Extract title title = soup.title.string if soup.title else "No title found" - + # Extract content based on preference if extract_text_only: # Remove script and style elements for script in soup(["script", "style"]): script.extract() - + # Get text and clean it up text = soup.get_text() # Break into lines and remove leading/trailing space @@ -121,11 +118,11 @@ def browse_web( else: # Return simplified HTML content = str(soup) - + # Truncate content if too long if len(content) > max_content_length: content = content[:max_content_length] + "... (content truncated)" - + return WebBrowserObservation( status="success", url=url, @@ -133,13 +130,13 @@ def browse_web( content=content, status_code=response.status_code, ) - + except requests.exceptions.RequestException as e: return WebBrowserObservation( status="error", - error=f"Error accessing URL: {str(e)}", + error=f"Error accessing URL: {e!s}", url=url, title="", content="", status_code=getattr(e.response, "status_code", 0) if hasattr(e, "response") else 0, - ) \ No newline at end of file + )