########################################################################################################################
# WEB SEARCH
########################################################################################################################


class WebSearchInput(BaseModel):
    """Argument schema for :class:`WebSearchTool`."""

    query: str = Field(..., description="The search query")
    num_results: int = Field(default=5, description="Number of results to return (default: 5)")
    search_engine: str = Field(default="google", description="Search engine to use (default: 'google')")


class WebSearchTool(BaseTool):
    """LangChain wrapper around the functional ``web_search`` tool."""

    name: ClassVar[str] = "web_search"
    description: ClassVar[str] = "Search the web for information and return relevant results"
    args_schema: ClassVar[type[BaseModel]] = WebSearchInput
    codebase: Codebase = Field(exclude=True)  # not part of the serialized tool schema

    def __init__(self, codebase: Codebase) -> None:
        super().__init__(codebase=codebase)

    def _run(self, query: str, num_results: int = 5, search_engine: str = "google") -> str:
        # Delegate to the functional tool and render its Result for the agent.
        return web_search(self.codebase, query, num_results=num_results, search_engine=search_engine).render()


class WebPageViewInput(BaseModel):
    """Argument schema for :class:`WebPageViewTool`."""

    url: str = Field(..., description="URL of the web page to view")
    selector: Optional[str] = Field(None, description="Optional CSS selector to extract specific content")
    max_length: int = Field(default=10000, description="Maximum length of content to return (default: 10000)")


class WebPageViewTool(BaseTool):
    """LangChain wrapper around the functional ``web_page_view`` tool."""

    name: ClassVar[str] = "web_page_view"
    description: ClassVar[str] = "Extract and view content from a web page"
    args_schema: ClassVar[type[BaseModel]] = WebPageViewInput
    codebase: Codebase = Field(exclude=True)  # not part of the serialized tool schema

    def __init__(self, codebase: Codebase) -> None:
        super().__init__(codebase=codebase)

    def _run(self, url: str, selector: Optional[str] = None, max_length: int = 10000) -> str:
        # Delegate to the functional tool and render its Result for the agent.
        return web_page_view(self.codebase, url, selector=selector, max_length=max_length).render()
"""Web page viewing tool for the code agent."""

from typing import Optional

import requests
from bs4 import BeautifulSoup

from codegen.sdk.core.codebase import Codebase
from codegen.sdk.core.result import Result


def web_page_view(
    codebase: Codebase,
    url: str,
    selector: Optional[str] = None,
    max_length: int = 10000,
) -> Result:
    """Extract readable text content from a web page.

    Args:
        codebase: The codebase (not used but required for consistency)
        url: URL of the web page to view
        selector: Optional CSS selector to extract specific content
        max_length: Maximum length of content to return (default: 10000)

    Returns:
        Result object with web page content
    """
    try:
        # Present a browser-like user agent to avoid trivial bot blocking.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

        # Bounded timeout so a stalled server cannot hang the agent.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Strip non-content elements before extracting text.
        for script in soup(["script", "style"]):
            script.extract()

        if selector:
            content_elements = soup.select(selector)
            if not content_elements:
                return Result(
                    success=False,
                    message=f"No elements found matching selector '{selector}'",
                    data=None,
                )
            content = "\n".join([elem.get_text(strip=True) for elem in content_elements])
        else:
            # Prefer common main-content containers; fall back to the whole page.
            main_content = soup.find("main") or soup.find("article") or soup.find("div", {"id": "content"}) or soup.find("div", {"class": "content"})
            content = (main_content or soup).get_text(strip=True)

        # Truncate oversized pages so the agent context stays bounded.
        if len(content) > max_length:
            content = content[:max_length] + "... [content truncated]"

        # BUGFIX: soup.title.string is None for an empty <title> or one that
        # contains child elements, which would leak None into the result data
        # instead of the intended fallback. Guard both the tag and its string.
        if soup.title and soup.title.string:
            title = soup.title.string
        else:
            title = "Unknown Title"

        return Result(
            success=True,
            message=f"Successfully extracted content from {url}",
            data={
                "url": url,
                "title": title,
                "content": content,
                "content_length": len(content),
            },
        )
    except requests.RequestException as e:
        return Result(
            success=False,
            message=f"Error fetching web page: {e!s}",
            data=None,
        )
    except Exception as e:
        return Result(
            success=False,
            message=f"Error processing web page: {e!s}",
            data=None,
        )
"""Web search tool for the code agent."""

import json
import os

import requests

from codegen.sdk.core.codebase import Codebase
from codegen.sdk.core.result import Result


def web_search(
    codebase: Codebase,
    query: str,
    num_results: int = 5,
    search_engine: str = "google",
) -> Result:
    """Search the web for information via SerpAPI.

    Args:
        codebase: The codebase (not used but required for consistency)
        query: The search query
        num_results: Number of results to return (default: 5)
        search_engine: Search engine to use (default: "google")

    Returns:
        Result object with search results
    """
    # The SerpAPI key is read from the environment, never hard-coded.
    api_key = os.environ.get("SERP_API_KEY")
    if not api_key:
        return Result(
            success=False,
            message="SERP_API_KEY environment variable not set. Please set it to use the web search tool.",
            data=None,
        )

    base_url = "https://serpapi.com/search"
    params = {
        "q": query,
        "api_key": api_key,
        "engine": search_engine,
        # Ask the API for only as many results as we need instead of
        # fetching the default page size and discarding the rest.
        "num": num_results,
    }

    try:
        # BUGFIX: the original request had no timeout, so a stalled SerpAPI
        # call could hang the agent indefinitely. 10s matches web_page_view.
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Slice defensively in case the engine returns more than requested.
        results = []
        if "organic_results" in data:
            for result in data["organic_results"][:num_results]:
                results.append(
                    {
                        "title": result.get("title", ""),
                        "link": result.get("link", ""),
                        "snippet": result.get("snippet", ""),
                    }
                )

        return Result(
            success=True,
            message=f"Found {len(results)} results for query: {query}",
            data={
                "query": query,
                "results": results,
            },
        )
    except requests.RequestException as e:
        return Result(
            success=False,
            message=f"Error performing web search: {e!s}",
            data=None,
        )
    except json.JSONDecodeError:
        return Result(
            success=False,
            message="Error parsing search results",
            data=None,
        )