From 681c4e993a8c866c1fcbd8986f0c4d000d67cc08 Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Sun, 16 Mar 2025 22:40:28 +0000 Subject: [PATCH 1/2] Add web search and page viewing tools for code agent --- src/codegen/extensions/langchain/agent.py | 8 +- src/codegen/extensions/langchain/tools.py | 61 +++++++++++- src/codegen/extensions/tools/web_page_view.py | 94 +++++++++++++++++++ src/codegen/extensions/tools/web_search.py | 80 ++++++++++++++++ 4 files changed, 239 insertions(+), 4 deletions(-) create mode 100644 src/codegen/extensions/tools/web_page_view.py create mode 100644 src/codegen/extensions/tools/web_search.py diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py index 8917daa7f..a8b8909ac 100644 --- a/src/codegen/extensions/langchain/agent.py +++ b/src/codegen/extensions/langchain/agent.py @@ -21,6 +21,8 @@ ReplacementEditTool, RevealSymbolTool, SearchTool, + WebSearchTool, + WebPageViewTool, # SemanticEditTool, ViewFileTool, ) @@ -76,6 +78,8 @@ def create_codebase_agent( ReplacementEditTool(codebase), RelaceEditTool(codebase), ReflectionTool(codebase), + WebSearchTool(codebase), + WebPageViewTool(codebase), # SemanticSearchTool(codebase), # =====[ Github Integration ]===== # Enable Github integration @@ -134,6 +138,8 @@ def create_chat_agent( MoveSymbolTool(codebase), RevealSymbolTool(codebase), RelaceEditTool(codebase), + WebSearchTool(codebase), + WebPageViewTool(codebase), ] if additional_tools: @@ -214,4 +220,4 @@ def create_agent_with_tools( memory = MemorySaver() if memory else None - return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) + return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) \ No newline at end of file diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 877b59f05..4232baba2 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -25,6 +25,8 @@ from codegen.extensions.tools.search import search from codegen.extensions.tools.semantic_edit import semantic_edit from codegen.extensions.tools.semantic_search import semantic_search +from codegen.extensions.tools.web_search import web_search +from codegen.extensions.tools.web_page_view import web_page_view from codegen.sdk.core.codebase import Codebase from ..tools import ( @@ -823,6 +825,59 @@ def _run(self, content: str) -> str: return "✅ Message sent successfully" +######################################################################################################################## +# WEB SEARCH +######################################################################################################################## + + +class WebSearchInput(BaseModel): + """Input for web search.""" + + query: str = Field(..., description="The search query") + num_results: int = Field(default=5, description="Number of results to return (default: 5)") + search_engine: str = Field(default="google", description="Search engine to use (default: 'google')") + + +class WebSearchTool(BaseTool): + """Tool for searching the web.""" + + name: ClassVar[str] = "web_search" + description: ClassVar[str] = "Search the web for information and return relevant results" + args_schema: ClassVar[type[BaseModel]] = WebSearchInput + codebase: Codebase = Field(exclude=True) + + def __init__(self, codebase: Codebase) -> None: + super().__init__(codebase=codebase) + + def _run(self, query: str, num_results: int = 5, search_engine: str = "google") -> str: + result = web_search(self.codebase, query, num_results=num_results, search_engine=search_engine) + return result.render() + + +class WebPageViewInput(BaseModel): + """Input for viewing web page content.""" + + url: str = Field(..., description="URL of the web page to view") + selector: Optional[str] = Field(None, description="Optional CSS selector to extract specific content") + max_length: int = Field(default=10000, description="Maximum length of content to return (default: 10000)") + + +class WebPageViewTool(BaseTool): + """Tool for viewing web page content.""" + + name: ClassVar[str] = "web_page_view" + description: ClassVar[str] = "Extract and view content from a web page" + args_schema: ClassVar[type[BaseModel]] = WebPageViewInput + codebase: Codebase = Field(exclude=True) + + def __init__(self, codebase: Codebase) -> None: + super().__init__(codebase=codebase) + + def _run(self, url: str, selector: Optional[str] = None, max_length: int = 10000) -> str: + result = web_page_view(self.codebase, url, selector=selector, max_length=max_length) + return result.render() + + ######################################################################################################################## # EXPORT ######################################################################################################################## @@ -850,11 +905,11 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: RevealSymbolTool(codebase), RunBashCommandTool(), # Note: This tool doesn't need the codebase SearchTool(codebase), - # SemanticEditTool(codebase), - # SemanticSearchTool(codebase), ViewFileTool(codebase), RelaceEditTool(codebase), ReflectionTool(codebase), + WebSearchTool(codebase), + WebPageViewTool(codebase), # Github GithubCreatePRTool(codebase), GithubCreatePRCommentTool(codebase), @@ -1023,4 +1078,4 @@ def _run( ) -> str: result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase) - return result.render() + return result.render() \ No newline at end of file diff --git a/src/codegen/extensions/tools/web_page_view.py b/src/codegen/extensions/tools/web_page_view.py new file mode 100644 index 000000000..423ca1aac --- /dev/null +++ b/src/codegen/extensions/tools/web_page_view.py @@ -0,0 +1,94 @@ +"""Web page viewing tool for the code agent.""" + +import os +import requests +from bs4 import BeautifulSoup +from typing import Optional + +from codegen.sdk.core.codebase import Codebase +from codegen.sdk.core.result import Result + + +def web_page_view( + codebase: Codebase, + url: str, + selector: Optional[str] = None, + max_length: int = 10000, +) -> Result: + """Extract content from a web page. + + Args: + codebase: The codebase (not used but required for consistency) + url: URL of the web page to view + selector: Optional CSS selector to extract specific content + max_length: Maximum length of content to return (default: 10000) + + Returns: + Result object with web page content + """ + try: + # Set user agent to avoid being blocked + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + + # Make the request + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + # Parse the HTML + soup = BeautifulSoup(response.text, "html.parser") + + # Remove script and style elements + for script in soup(["script", "style"]): + script.extract() + + # Extract content based on selector if provided + if selector: + content_elements = soup.select(selector) + if not content_elements: + return Result( + success=False, + message=f"No elements found matching selector '{selector}'", + data=None, + ) + content = "\n".join([elem.get_text(strip=True) for elem in content_elements]) + else: + # Get the main content (try common content containers) + main_content = soup.find("main") or soup.find("article") or soup.find("div", {"id": "content"}) or soup.find("div", {"class": "content"}) + + if main_content: + content = main_content.get_text(strip=True) + else: + # Fall back to the entire page text + content = soup.get_text(strip=True) + + # Truncate if too long + if len(content) > max_length: + content = content[:max_length] + "... [content truncated]" + + # Get the page title + title = soup.title.string if soup.title else "Unknown Title" + + return Result( + success=True, + message=f"Successfully extracted content from {url}", + data={ + "url": url, + "title": title, + "content": content, + "content_length": len(content), + }, + ) + except requests.RequestException as e: + return Result( + success=False, + message=f"Error fetching web page: {str(e)}", + data=None, + ) + except Exception as e: + return Result( + success=False, + message=f"Error processing web page: {str(e)}", + data=None, + ) \ No newline at end of file diff --git a/src/codegen/extensions/tools/web_search.py b/src/codegen/extensions/tools/web_search.py new file mode 100644 index 000000000..f36832e6f --- /dev/null +++ b/src/codegen/extensions/tools/web_search.py @@ -0,0 +1,80 @@ +"""Web search tool for the code agent.""" + +import json +import os +import requests +from typing import List, Dict, Any, Optional + +from codegen.sdk.core.codebase import Codebase +from codegen.sdk.core.result import Result + + +def web_search( + codebase: Codebase, + query: str, + num_results: int = 5, + search_engine: str = "google", +) -> Result: + """Search the web for information. + + Args: + codebase: The codebase (not used but required for consistency) + query: The search query + num_results: Number of results to return (default: 5) + search_engine: Search engine to use (default: "google") + + Returns: + Result object with search results + """ + # Get API key from environment variable + api_key = os.environ.get("SERP_API_KEY") + if not api_key: + return Result( + success=False, + message="SERP_API_KEY environment variable not set. Please set it to use the web search tool.", + data=None, + ) + + # Prepare the API request + base_url = "https://serpapi.com/search" + params = { + "q": query, + "api_key": api_key, + "engine": search_engine, + } + + try: + response = requests.get(base_url, params=params) + response.raise_for_status() + data = response.json() + + # Extract organic search results + results = [] + if "organic_results" in data: + for result in data["organic_results"][:num_results]: + results.append({ + "title": result.get("title", ""), + "link": result.get("link", ""), + "snippet": result.get("snippet", ""), + }) + + return Result( + success=True, + message=f"Found {len(results)} results for query: {query}", + data={ + "query": query, + "results": results, + }, + ) + except requests.RequestException as e: + return Result( + success=False, + message=f"Error performing web search: {str(e)}", + data=None, + ) + except json.JSONDecodeError: + return Result( + success=False, + message="Error parsing search results", + data=None, + ) \ No newline at end of file From 7d45a3f3a4a409b4bf926ac11996232eb54c238e Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Sun, 16 Mar 2025 22:41:21 +0000 Subject: [PATCH 2/2] Automated pre-commit update --- src/codegen/extensions/langchain/agent.py | 6 ++-- src/codegen/extensions/langchain/tools.py | 4 +-- src/codegen/extensions/tools/web_page_view.py | 34 +++++++++---------- src/codegen/extensions/tools/web_search.py | 30 ++++++++-------- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py index a8b8909ac..70106c473 100644 --- a/src/codegen/extensions/langchain/agent.py +++ b/src/codegen/extensions/langchain/agent.py @@ -21,10 +21,10 @@ ReplacementEditTool, RevealSymbolTool, SearchTool, - WebSearchTool, - WebPageViewTool, # SemanticEditTool, ViewFileTool, + WebPageViewTool, + WebSearchTool, ) from .graph import create_react_agent @@ -220,4 +220,4 @@ def create_agent_with_tools( memory = MemorySaver() if memory else None - return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) \ No newline at end of file + return create_react_agent(model=llm, tools=tools, system_message=system_message, checkpointer=memory, debug=debug, config=config) diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 4232baba2..4ab8da3e4 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -25,8 +25,8 @@ from codegen.extensions.tools.search import search from codegen.extensions.tools.semantic_edit import semantic_edit from codegen.extensions.tools.semantic_search import semantic_search -from codegen.extensions.tools.web_search import web_search from codegen.extensions.tools.web_page_view import web_page_view +from codegen.extensions.tools.web_search import web_search from codegen.sdk.core.codebase import Codebase from ..tools import ( @@ -1078,4 +1078,4 @@ def _run( ) -> str: result = perform_reflection(context_summary=context_summary, findings_so_far=findings_so_far, current_challenges=current_challenges, reflection_focus=reflection_focus, codebase=self.codebase) - return result.render() \ No newline at end of file + return result.render() diff --git a/src/codegen/extensions/tools/web_page_view.py b/src/codegen/extensions/tools/web_page_view.py index 423ca1aac..d7bf7047e 100644 --- a/src/codegen/extensions/tools/web_page_view.py +++ b/src/codegen/extensions/tools/web_page_view.py @@ -1,9 +1,9 @@ """Web page viewing tool for the code agent.""" -import os +from typing import Optional + import requests from bs4 import BeautifulSoup -from typing import Optional from codegen.sdk.core.codebase import Codebase from codegen.sdk.core.result import Result @@ -16,33 +16,31 @@ def web_page_view( max_length: int = 10000, ) -> Result: """Extract content from a web page. - + Args: codebase: The codebase (not used but required for consistency) url: URL of the web page to view selector: Optional CSS selector to extract specific content max_length: Maximum length of content to return (default: 10000) - + Returns: Result object with web page content """ try: # Set user agent to avoid being blocked - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"} + # Make the request response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() - + # Parse the HTML soup = BeautifulSoup(response.text, "html.parser") - + # Remove script and style elements for script in soup(["script", "style"]): script.extract() - + # Extract content based on selector if provided if selector: content_elements = soup.select(selector) @@ -56,20 +54,20 @@ def web_page_view( else: # Get the main content (try common content containers) main_content = soup.find("main") or soup.find("article") or soup.find("div", {"id": "content"}) or soup.find("div", {"class": "content"}) - + if main_content: content = main_content.get_text(strip=True) else: # Fall back to the entire page text content = soup.get_text(strip=True) - + # Truncate if too long if len(content) > max_length: content = content[:max_length] + "... [content truncated]" - + # Get the page title title = soup.title.string if soup.title else "Unknown Title" - + return Result( success=True, message=f"Successfully extracted content from {url}", @@ -83,12 +81,12 @@ def web_page_view( except requests.RequestException as e: return Result( success=False, - message=f"Error fetching web page: {str(e)}", + message=f"Error fetching web page: {e!s}", data=None, ) except Exception as e: return Result( success=False, - message=f"Error processing web page: {str(e)}", + message=f"Error processing web page: {e!s}", data=None, - ) \ No newline at end of file + ) diff --git a/src/codegen/extensions/tools/web_search.py b/src/codegen/extensions/tools/web_search.py index f36832e6f..b5cced42b 100644 --- a/src/codegen/extensions/tools/web_search.py +++ b/src/codegen/extensions/tools/web_search.py @@ -2,8 +2,8 @@ import json import os + import requests -from typing import List, Dict, Any, Optional from codegen.sdk.core.codebase import Codebase from codegen.sdk.core.result import Result @@ -16,13 +16,13 @@ def web_search( search_engine: str = "google", ) -> Result: """Search the web for information. - + Args: codebase: The codebase (not used but required for consistency) query: The search query num_results: Number of results to return (default: 5) search_engine: Search engine to use (default: "google") - + Returns: Result object with search results """ @@ -34,7 +34,7 @@ def web_search( message="SERP_API_KEY environment variable not set. Please set it to use the web search tool.", data=None, ) - + # Prepare the API request base_url = "https://serpapi.com/search" params = { @@ -42,22 +42,24 @@ def web_search( "api_key": api_key, "engine": search_engine, } - + try: response = requests.get(base_url, params=params) response.raise_for_status() data = response.json() - + # Extract organic search results results = [] if "organic_results" in data: for result in data["organic_results"][:num_results]: - results.append({ - "title": result.get("title", ""), - "link": result.get("link", ""), - "snippet": result.get("snippet", ""), - }) - + results.append( + { + "title": result.get("title", ""), + "link": result.get("link", ""), + "snippet": result.get("snippet", ""), + } + ) + return Result( success=True, message=f"Found {len(results)} results for query: {query}", @@ -69,7 +71,7 @@ def web_search( except requests.RequestException as e: return Result( success=False, - message=f"Error performing web search: {str(e)}", + message=f"Error performing web search: {e!s}", data=None, ) except json.JSONDecodeError: @@ -77,4 +79,4 @@ def web_search( success=False, message="Error parsing search results", data=None, - ) \ No newline at end of file + )