Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/codegen/extensions/langchain/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
SearchTool,
# SemanticEditTool,
ViewFileTool,
WebBrowserTool,
)

from .graph import create_react_agent
Expand Down Expand Up @@ -134,6 +135,7 @@ def create_chat_agent(
MoveSymbolTool(codebase),
RevealSymbolTool(codebase),
RelaceEditTool(codebase),
WebBrowserTool(codebase),
]

if additional_tools:
Expand Down
45 changes: 43 additions & 2 deletions src/codegen/extensions/langchain/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from codegen.extensions.tools.search import search
from codegen.extensions.tools.semantic_edit import semantic_edit
from codegen.extensions.tools.semantic_search import semantic_search
from codegen.extensions.tools.web_browser import browse_web
from codegen.sdk.core.codebase import Codebase

from ..tools import (
Expand Down Expand Up @@ -850,11 +851,10 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:
RevealSymbolTool(codebase),
RunBashCommandTool(), # Note: This tool doesn't need the codebase
SearchTool(codebase),
# SemanticEditTool(codebase),
# SemanticSearchTool(codebase),
ViewFileTool(codebase),
RelaceEditTool(codebase),
ReflectionTool(codebase),
WebBrowserTool(codebase),
# Github
GithubCreatePRTool(codebase),
GithubCreatePRCommentTool(codebase),
Expand Down Expand Up @@ -1024,3 +1024,44 @@ def _run(
result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase)

return result.render()


class WebBrowserInput(BaseModel):
    """Schema for the arguments accepted by `WebBrowserTool` / `browse_web`."""

    # Target page; browse_web rejects URLs without a scheme and domain.
    url: str = Field(..., description="URL to browse (must include http:// or https://)")
    # True strips markup/scripts and returns plain text; False returns the parsed HTML.
    extract_text_only: bool = Field(default=True, description="Whether to extract only text content (True) or include HTML (False)")
    # Passed straight through to the HTTP request.
    timeout: int = Field(default=10, description="Request timeout in seconds")
    # Content beyond this length is truncated with a marker before being returned.
    max_content_length: int = Field(default=10000, description="Maximum content length to return")

class WebBrowserTool(BaseTool):
    """Langchain tool wrapper around `browse_web` for fetching web-page content."""

    name: ClassVar[str] = "browse_web"
    description: ClassVar[str] = """
    Browse a website and extract its content.
    This tool allows you to access web pages, retrieve information, and analyze online content.
    Useful for researching documentation, checking references, or gathering information from websites.
    """
    args_schema: ClassVar[type[BaseModel]] = WebBrowserInput
    # Excluded from serialization; only held so _run can forward it to browse_web.
    codebase: Codebase = Field(exclude=True)

    def __init__(self, codebase: Codebase) -> None:
        super().__init__(codebase=codebase)

    def _run(
        self,
        url: str,
        extract_text_only: bool = True,
        timeout: int = 10,
        max_content_length: int = 10000,
    ) -> str:
        # Delegate to the shared implementation, then render the observation
        # into the plain-text form langchain expects from a tool.
        observation = browse_web(
            self.codebase,
            url=url,
            extract_text_only=extract_text_only,
            timeout=timeout,
            max_content_length=max_content_length,
        )
        return observation.render()
3 changes: 3 additions & 0 deletions src/codegen/extensions/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@
from .semantic_edit import semantic_edit
from .semantic_search import semantic_search
from .view_file import view_file
from .web_browser import browse_web

__all__ = [
# Web operations
"browse_web",
# Git operations
"commit",
# File operations
Expand Down
142 changes: 142 additions & 0 deletions src/codegen/extensions/tools/web_browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Web browser tool for accessing web content.

This tool allows fetching web pages and extracting their content,
enabling Codegen to browse websites and retrieve information.
"""

from typing import ClassVar
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from pydantic import Field

from codegen.sdk.core.codebase import Codebase

from .observation import Observation


class WebBrowserObservation(Observation):
    """Observation describing the outcome of fetching one web page."""

    url: str = Field(
        description="The URL that was accessed",
    )
    title: str = Field(
        description="The title of the web page",
    )
    content: str = Field(
        description="The extracted content from the web page",
    )
    status_code: int = Field(
        description="HTTP status code of the response",
    )

    str_template: ClassVar[str] = "Browsed {url} (Status: {status_code})"

    def render(self) -> str:
        """Render web browser results in a readable format."""
        # Error observations carry their message in self.error (from Observation).
        if self.status == "error":
            return f"[WEB BROWSER ERROR]: {self.error}"

        # Cap the displayed body at 2000 chars so the rendered output stays short.
        body = self.content[:2000]
        if len(self.content) > 2000:
            body += "..."

        return "\n".join(
            [
                f"[WEB PAGE]: {self.url}",
                f"Status: {self.status_code}",
                f"Title: {self.title}",
                "",
                "Content:",
                "----------",
                body,
                "----------",
            ]
        )


def browse_web(
    codebase: Codebase,
    url: str,
    extract_text_only: bool = True,
    timeout: int = 10,
    max_content_length: int = 10000,
) -> WebBrowserObservation:
    """Browse a web page and extract its content.

    Args:
        codebase: The codebase to operate on (not used but required for tool interface)
        url: The URL to browse; "https://" is assumed when no scheme is given
        extract_text_only: Whether to extract only text content (default: True)
        timeout: Request timeout in seconds (default: 10)
        max_content_length: Maximum content length to return (default: 10000)

    Returns:
        WebBrowserObservation containing the web page content and metadata
    """
    # Default to https:// for scheme-less inputs like "example.com".
    # (Previously this normalization sat *after* the validation below, making it
    # unreachable: scheme-less URLs were rejected before it could run.)
    if not urlparse(url).scheme:
        url = f"https://{url}"

    # Validate the (possibly normalized) URL: needs both scheme and domain.
    parsed_url = urlparse(url)
    if not parsed_url.scheme or not parsed_url.netloc:
        return WebBrowserObservation(
            status="error",
            error=f"Invalid URL: {url}. Must include scheme (http:// or https://) and domain.",
            url=url,
            title="",
            content="",
            status_code=0,
        )

    try:
        # Browser-like user agent to avoid being blocked by naive bot filters.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

        # Make the request; non-2xx statuses raise and are reported as errors below.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # <title> may be missing, or present but empty / non-string-valued, in
        # which case soup.title.string is None — never let None reach the str field.
        title = soup.title.string if soup.title and soup.title.string else "No title found"

        if extract_text_only:
            # Remove script and style elements so only visible text remains.
            for script in soup(["script", "style"]):
                script.extract()

            # Get text and clean it up.
            text = soup.get_text()
            # Break into lines and remove leading/trailing space.
            lines = (line.strip() for line in text.splitlines())
            # Break multi-headlines into a line each: split on *two* spaces, the
            # gap get_text leaves between adjacent inline elements. (A single-space
            # split would wrongly put every word on its own line.)
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # Remove blank lines.
            content = "\n".join(chunk for chunk in chunks if chunk)
        else:
            # Return the parsed (normalized) HTML.
            content = str(soup)

        # Truncate content if too long.
        if len(content) > max_content_length:
            content = content[:max_content_length] + "... (content truncated)"

        return WebBrowserObservation(
            status="success",
            url=url,
            title=title,
            content=content,
            status_code=response.status_code,
        )

    except requests.exceptions.RequestException as e:
        # e.response is None for connection-level failures (DNS, refused, timeout);
        # it only carries a status code when an HTTP response actually came back.
        err_response = getattr(e, "response", None)
        return WebBrowserObservation(
            status="error",
            error=f"Error accessing URL: {e!s}",
            url=url,
            title="",
            content="",
            status_code=err_response.status_code if err_response is not None else 0,
        )
Loading