From f3808272e4044b21dbb7f657c1688d060f116401 Mon Sep 17 00:00:00 2001
From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com>
Date: Sat, 22 Mar 2025 03:49:01 +0000
Subject: [PATCH 1/2] Add fractional search feature to ripgrep_search tool

---
 src/codegen/extensions/langchain/tools.py |  11 +-
 src/codegen/extensions/tools/search.py    | 264 ++++++++++++++++------
 2 files changed, 200 insertions(+), 75 deletions(-)

diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 0749384a4..712a994bb 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -126,19 +126,21 @@ class SearchInput(BaseModel):
 
     query: str = Field(
         ...,
-        description="""ripgrep query (or regex pattern) to run. For regex searches, set use_regex=True. Ripgrep is the preferred method.""",
+        description="""The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True.
+        Ripgrep is the preferred method.""",
     )
     file_extensions: list[str] | None = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
     page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
     files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
     use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
+    fractional_search: bool = Field(default=False, description="Whether to search for individual words if full query returns no results (default: False)")
     tool_call_id: Annotated[str, InjectedToolCallId]
 
 
 class RipGrepTool(BaseTool):
     """Tool for searching the codebase via RipGrep."""
 
-    name: ClassVar[str] = "search"
+    name: ClassVar[str] = "ripgrep_search"
     description: ClassVar[str] = "Search the codebase using `ripgrep` or regex pattern matching"
     args_schema: ClassVar[type[BaseModel]] = SearchInput
     codebase: Codebase = Field(exclude=True)
@@ -146,8 +148,8 @@ class RipGrepTool(BaseTool):
     def __init__(self, codebase: Codebase) -> None:
         super().__init__(codebase=codebase)
 
-    def _run(self, tool_call_id: str, query: str, file_extensions: Optional[list[str]] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False) -> ToolMessage:
-        result = search(self.codebase, query, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
+    def _run(self, tool_call_id: str, query: str, file_extensions: Optional[list[str]] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False, fractional_search: bool = False) -> ToolMessage:
+        result = search(self.codebase, query, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex, fractional_search=fractional_search)
         return result.render(tool_call_id)
 
 
@@ -1128,3 +1130,4 @@ def __init__(self, codebase: Codebase):
     def _run(self, pattern: str, page: int = 1, files_per_page: int | float = 10) -> str:
         """Execute the glob pattern search using fd."""
         return search_files_by_name(self.codebase, pattern, page=page, files_per_page=files_per_page).render()
+
diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py
index 3f69be59c..6fa2a7de5 100644
--- a/src/codegen/extensions/tools/search.py
+++ b/src/codegen/extensions/tools/search.py
@@ -175,10 +175,13 @@ def _search_with_ripgrep(
     page: int = 1,
     files_per_page: int = 10,
     use_regex: bool = False,
+    fractional_search: bool = False,
 ) -> SearchObservation:
     """Search the codebase using ripgrep.
 
     This is faster than the Python implementation, especially for large codebases.
+    If fractional_search is True and a query containing spaces returns no results,
+    each word of 3+ characters is searched on its own and the hits are merged.
     """
     # Build ripgrep command
     cmd = ["rg", "--line-number"]
@@ -231,47 +234,108 @@ def _search_with_ripgrep(
                 results=[],
             )
 
-        # Parse output lines
-        for line in result.stdout.splitlines():
-            # ripgrep output format: file:line:content
-            parts = line.split(":", 2)
-            if len(parts) < 3:
-                continue
-
-            filepath, line_number_str, content = parts
-
-            # Convert to relative path within the codebase
-            rel_path = os.path.relpath(filepath, codebase.repo_path)
-
-            try:
-                line_number = int(line_number_str)
-
-                # Find the actual match text
-                match_text = query
-                if use_regex:
-                    # For regex, we need to find what actually matched
-                    # This is a simplification - ideally we'd use ripgrep's --json option
-                    # to get the exact match positions
-                    pattern = re.compile(query)
-                    match_obj = pattern.search(content)
-                    if match_obj:
-                        match_text = match_obj.group(0)
-
-                # Create or append to file results
-                if rel_path not in all_results:
-                    all_results[rel_path] = []
-
-                all_results[rel_path].append(
-                    SearchMatch(
-                        status="success",
-                        line_number=line_number,
-                        line=content.strip(),
-                        match=match_text,
-                    )
+        # Check if we got no results and fractional search is enabled
+        if result.stdout.strip() == "" and fractional_search and " " in query:
+            logger.info(f"No results found for '{query}', trying fractional search")
+            # Split the query into individual words
+            words = query.split()
+            # Filter out very short words (less than 3 characters)
+            words = [word for word in words if len(word) >= 3]
+            
+            # Search for each word individually and combine results
+            for word in words:
+                word_cmd = cmd.copy()
+                # Replace the query with the individual word
+                word_cmd[-2] = word
+                
+                logger.info(f"Running fractional search with: {' '.join(word_cmd)}")
+                word_result = subprocess.run(
+                    word_cmd,
+                    capture_output=True,
+                    text=True,
+                    encoding="utf-8",
+                    check=False,
                 )
-            except ValueError:
-                # Skip lines with invalid line numbers
-                continue
+                
+                # Parse output lines for this word
+                for line in word_result.stdout.splitlines():
+                    # ripgrep output format: file:line:content
+                    parts = line.split(":", 2)
+                    if len(parts) < 3:
+                        continue
+
+                    filepath, line_number_str, content = parts
+
+                    # Convert to relative path within the codebase
+                    rel_path = os.path.relpath(filepath, codebase.repo_path)
+
+                    try:
+                        line_number = int(line_number_str)
+
+                        # Create or append to file results
+                        if rel_path not in all_results:
+                            all_results[rel_path] = []
+
+                        # Check if this line is already in the results to avoid duplicates
+                        line_exists = any(
+                            match.line_number == line_number and match.line.strip() == content.strip()
+                            for match in all_results[rel_path]
+                        )
+                        
+                        if not line_exists:
+                            all_results[rel_path].append(
+                                SearchMatch(
+                                    status="success",
+                                    line_number=line_number,
+                                    line=content.strip(),
+                                    match=word,
+                                )
+                            )
+                    except ValueError:
+                        # Skip lines with invalid line numbers
+                        continue
+        else:
+            # Parse output lines from the original search
+            for line in result.stdout.splitlines():
+                # ripgrep output format: file:line:content
+                parts = line.split(":", 2)
+                if len(parts) < 3:
+                    continue
+
+                filepath, line_number_str, content = parts
+
+                # Convert to relative path within the codebase
+                rel_path = os.path.relpath(filepath, codebase.repo_path)
+
+                try:
+                    line_number = int(line_number_str)
+
+                    # Find the actual match text
+                    match_text = query
+                    if use_regex:
+                        # For regex, we need to find what actually matched
+                        # This is a simplification - ideally we'd use ripgrep's --json option
+                        # to get the exact match positions
+                        pattern = re.compile(query)
+                        match_obj = pattern.search(content)
+                        if match_obj:
+                            match_text = match_obj.group(0)
+
+                    # Create or append to file results
+                    if rel_path not in all_results:
+                        all_results[rel_path] = []
+
+                    all_results[rel_path].append(
+                        SearchMatch(
+                            status="success",
+                            line_number=line_number,
+                            line=content.strip(),
+                            match=match_text,
+                        )
+                    )
+                except ValueError:
+                    # Skip lines with invalid line numbers
+                    continue
 
         # Convert to SearchFileResult objects
         file_results = []
@@ -318,10 +382,13 @@ def _search_with_python(
     page: int = 1,
     files_per_page: int = 10,
     use_regex: bool = False,
+    fractional_search: bool = False,
 ) -> SearchObservation:
     """Search the codebase using Python's regex engine.
 
     This is a fallback for when ripgrep is not available.
+    If fractional_search is True and a non-regex query containing spaces returns
+    no results, each word of 3+ characters is searched on its own and merged.
     """
     # Validate pagination parameters
     if page < 1:
@@ -352,38 +419,88 @@ def _search_with_python(
     extensions = file_extensions if file_extensions is not None else "*"
 
     all_results = []
-    for file in codebase.files(extensions=extensions):
-        # Skip binary files
-        try:
-            content = file.content
-        except ValueError:  # File is binary
-            continue
-
-        file_matches = []
-        # Split content into lines and store with line numbers (1-based)
-        lines = enumerate(content.splitlines(), 1)
-
-        # Search each line for the pattern
-        for line_number, line in lines:
-            match = pattern.search(line)
-            if match:
-                file_matches.append(
-                    SearchMatch(
+    
+    # Function to search files with a given pattern
+    def search_files_with_pattern(search_pattern, match_text):
+        file_results = []
+        for file in codebase.files(extensions=extensions):
+            # Skip binary files
+            try:
+                content = file.content
+            except ValueError:  # File is binary
+                continue
+
+            file_matches = []
+            # Split content into lines and store with line numbers (1-based)
+            lines = enumerate(content.splitlines(), 1)
+
+            # Search each line for the pattern
+            for line_number, line in lines:
+                match = search_pattern.search(line)
+                if match:
+                    # Check if this match is already in the results to avoid duplicates
+                    match_exists = any(
+                        m.line_number == line_number and m.line.strip() == line.strip()
+                        for m in file_matches
+                    )
+                    
+                    if not match_exists:
+                        file_matches.append(
+                            SearchMatch(
+                                status="success",
+                                line_number=line_number,
+                                line=line.strip(),
+                                match=match_text if match_text else match.group(0),
+                            )
+                        )
+
+            if file_matches:
+                file_results.append(
+                    SearchFileResult(
                         status="success",
-                        line_number=line_number,
-                        line=line.strip(),
-                        match=match.group(0),
+                        filepath=file.filepath,
+                        matches=sorted(file_matches, key=lambda x: x.line_number),
                     )
                 )
-
-        if file_matches:
-            all_results.append(
-                SearchFileResult(
-                    status="success",
-                    filepath=file.filepath,
-                    matches=sorted(file_matches, key=lambda x: x.line_number),
-                )
-            )
+        return file_results
+
+    # First try with the full query
+    all_results = search_files_with_pattern(pattern, query if not use_regex else None)
+    
+    # If no results and fractional search is enabled, try individual words
+    if not all_results and fractional_search and " " in query and not use_regex:
+        logger.info(f"No results found for '{query}', trying fractional search")
+        # Split the query into individual words
+        words = query.split()
+        # Filter out very short words (less than 3 characters)
+        words = [word for word in words if len(word) >= 3]
+        
+        # Search for each word individually
+        for word in words:
+            word_pattern = re.compile(re.escape(word), re.IGNORECASE)
+            word_results = search_files_with_pattern(word_pattern, word)
+            
+            # Merge results
+            for word_result in word_results:
+                # Check if this file is already in all_results
+                existing_file = next((r for r in all_results if r.filepath == word_result.filepath), None)
+                
+                if existing_file:
+                    # Merge matches, avoiding duplicates
+                    for match in word_result.matches:
+                        match_exists = any(
+                            m.line_number == match.line_number and m.line.strip() == match.line.strip()
+                            for m in existing_file.matches
+                        )
+                        
+                        if not match_exists:
+                            existing_file.matches.append(match)
+                    
+                    # Re-sort matches by line number
+                    existing_file.matches.sort(key=lambda x: x.line_number)
+                else:
+                    # Add new file result
+                    all_results.append(word_result)
 
     # Sort all results by filepath
     all_results.sort(key=lambda x: x.filepath)
@@ -415,6 +532,7 @@ def search(
     page: int = 1,
     files_per_page: int = 10,
     use_regex: bool = False,
+    fractional_search: bool = False,
 ) -> SearchObservation:
     """Search the codebase using text search or regex pattern matching.
 
@@ -423,6 +541,9 @@ def search(
     Otherwise, performs a case-insensitive text search.
     Returns matching lines with their line numbers, grouped by file.
     Results are paginated by files, with a default of 10 files per page.
+    
+    If fractional_search is True and no results are found for the full query,
+    it will automatically search for individual words from the query.
 
     Args:
         codebase: The codebase to operate on
@@ -432,13 +553,14 @@ def search(
         page: Page number to return (1-based, default: 1)
         files_per_page: Number of files to return per page (default: 10)
         use_regex: Whether to treat query as a regex pattern (default: False)
+        fractional_search: Whether to search for individual words if full query returns no results (default: False)
 
     Returns:
         SearchObservation containing search results with matches and their sources
     """
     # Try to use ripgrep first
     try:
-        return _search_with_ripgrep(codebase, query, file_extensions, page, files_per_page, use_regex)
+        return _search_with_ripgrep(codebase, query, file_extensions, page, files_per_page, use_regex, fractional_search)
     except (FileNotFoundError, subprocess.SubprocessError):
         # Fall back to Python implementation if ripgrep fails or isn't available
-        return _search_with_python(codebase, query, file_extensions, page, files_per_page, use_regex)
+        return _search_with_python(codebase, query, file_extensions, page, files_per_page, use_regex, fractional_search)

From e13f69e98b896c47b96877d1ed3d6188970b0e9e Mon Sep 17 00:00:00 2001
From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com>
Date: Sat, 22 Mar 2025 03:49:46 +0000
Subject: [PATCH 2/2] Automated pre-commit update

---
 src/codegen/extensions/langchain/tools.py |  5 +--
 src/codegen/extensions/tools/search.py    | 41 +++++++++--------------
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 712a994bb..bcbfaa127 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -148,7 +148,9 @@ class RipGrepTool(BaseTool):
     def __init__(self, codebase: Codebase) -> None:
         super().__init__(codebase=codebase)
 
-    def _run(self, tool_call_id: str, query: str, file_extensions: Optional[list[str]] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False, fractional_search: bool = False) -> ToolMessage:
+    def _run(
+        self, tool_call_id: str, query: str, file_extensions: Optional[list[str]] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False, fractional_search: bool = False
+    ) -> ToolMessage:
         result = search(self.codebase, query, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex, fractional_search=fractional_search)
         return result.render(tool_call_id)
 
@@ -1130,4 +1132,3 @@ def __init__(self, codebase: Codebase):
     def _run(self, pattern: str, page: int = 1, files_per_page: int | float = 10) -> str:
         """Execute the glob pattern search using fd."""
         return search_files_by_name(self.codebase, pattern, page=page, files_per_page=files_per_page).render()
-
diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py
index 6fa2a7de5..4593af9cf 100644
--- a/src/codegen/extensions/tools/search.py
+++ b/src/codegen/extensions/tools/search.py
@@ -241,13 +241,13 @@ def _search_with_ripgrep(
             words = query.split()
             # Filter out very short words (less than 3 characters)
             words = [word for word in words if len(word) >= 3]
-            
+
             # Search for each word individually and combine results
             for word in words:
                 word_cmd = cmd.copy()
                 # Replace the query with the individual word
                 word_cmd[-2] = word
-                
+
                 logger.info(f"Running fractional search with: {' '.join(word_cmd)}")
                 word_result = subprocess.run(
                     word_cmd,
@@ -256,7 +256,7 @@ def _search_with_ripgrep(
                     encoding="utf-8",
                     check=False,
                 )
-                
+
                 # Parse output lines for this word
                 for line in word_result.stdout.splitlines():
                     # ripgrep output format: file:line:content
@@ -277,11 +277,8 @@ def _search_with_ripgrep(
                             all_results[rel_path] = []
 
                         # Check if this line is already in the results to avoid duplicates
-                        line_exists = any(
-                            match.line_number == line_number and match.line.strip() == content.strip()
-                            for match in all_results[rel_path]
-                        )
-                        
+                        line_exists = any(match.line_number == line_number and match.line.strip() == content.strip() for match in all_results[rel_path])
+
                         if not line_exists:
                             all_results[rel_path].append(
                                 SearchMatch(
@@ -419,7 +416,7 @@ def _search_with_python(
     extensions = file_extensions if file_extensions is not None else "*"
 
     all_results = []
-    
+
     # Function to search files with a given pattern
     def search_files_with_pattern(search_pattern, match_text):
         file_results = []
@@ -439,11 +436,8 @@ def search_files_with_pattern(search_pattern, match_text):
                 match = search_pattern.search(line)
                 if match:
                     # Check if this match is already in the results to avoid duplicates
-                    match_exists = any(
-                        m.line_number == line_number and m.line.strip() == line.strip()
-                        for m in file_matches
-                    )
-                    
+                    match_exists = any(m.line_number == line_number and m.line.strip() == line.strip() for m in file_matches)
+
                     if not match_exists:
                         file_matches.append(
                             SearchMatch(
@@ -466,7 +460,7 @@ def search_files_with_pattern(search_pattern, match_text):
 
     # First try with the full query
     all_results = search_files_with_pattern(pattern, query if not use_regex else None)
-    
+
     # If no results and fractional search is enabled, try individual words
     if not all_results and fractional_search and " " in query and not use_regex:
         logger.info(f"No results found for '{query}', trying fractional search")
@@ -474,28 +468,25 @@ def search_files_with_pattern(search_pattern, match_text):
         words = query.split()
         # Filter out very short words (less than 3 characters)
         words = [word for word in words if len(word) >= 3]
-        
+
         # Search for each word individually
         for word in words:
             word_pattern = re.compile(re.escape(word), re.IGNORECASE)
             word_results = search_files_with_pattern(word_pattern, word)
-            
+
             # Merge results
             for word_result in word_results:
                 # Check if this file is already in all_results
                 existing_file = next((r for r in all_results if r.filepath == word_result.filepath), None)
-                
+
                 if existing_file:
                     # Merge matches, avoiding duplicates
                     for match in word_result.matches:
-                        match_exists = any(
-                            m.line_number == match.line_number and m.line.strip() == match.line.strip()
-                            for m in existing_file.matches
-                        )
-                        
+                        match_exists = any(m.line_number == match.line_number and m.line.strip() == match.line.strip() for m in existing_file.matches)
+
                         if not match_exists:
                             existing_file.matches.append(match)
-                    
+
                     # Re-sort matches by line number
                     existing_file.matches.sort(key=lambda x: x.line_number)
                 else:
@@ -541,7 +532,7 @@ def search(
     Otherwise, performs a case-insensitive text search.
     Returns matching lines with their line numbers, grouped by file.
     Results are paginated by files, with a default of 10 files per page.
-    
+
     If fractional_search is True and no results are found for the full query,
     it will automatically search for individual words from the query.