From c88ef35fbd1b0f3a89196f331b34d752eee05389 Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Sat, 22 Mar 2025 03:44:20 +0000 Subject: [PATCH 1/2] Add fractional search feature to ripgrep_search tool --- src/codegen/extensions/tools/search.py | 226 ++++++++++++++++++++----- 1 file changed, 183 insertions(+), 43 deletions(-) diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py index 3f69be59c..87cc05853 100644 --- a/src/codegen/extensions/tools/search.py +++ b/src/codegen/extensions/tools/search.py @@ -175,10 +175,14 @@ def _search_with_ripgrep( page: int = 1, files_per_page: int = 10, use_regex: bool = False, + fractional_search: bool = False, ) -> SearchObservation: """Search the codebase using ripgrep. This is faster than the Python implementation, especially for large codebases. + + If fractional_search is True and the initial search returns no results, + it will automatically split the query into individual words and search for each one. """ # Build ripgrep command cmd = ["rg", "--line-number"] @@ -230,48 +234,121 @@ def _search_with_ripgrep( files_per_page=files_per_page, results=[], ) - - # Parse output lines - for line in result.stdout.splitlines(): - # ripgrep output format: file:line:content - parts = line.split(":", 2) - if len(parts) < 3: - continue - - filepath, line_number_str, content = parts - - # Convert to relative path within the codebase - rel_path = os.path.relpath(filepath, codebase.repo_path) - - try: - line_number = int(line_number_str) - - # Find the actual match text - match_text = query - if use_regex: - # For regex, we need to find what actually matched - # This is a simplification - ideally we'd use ripgrep's --json option - # to get the exact match positions - pattern = re.compile(query) - match_obj = pattern.search(content) - if match_obj: - match_text = match_obj.group(0) - - # Create or append to file results - if rel_path not in all_results: - all_results[rel_path] = [] - - all_results[rel_path].append( - SearchMatch( - status="success", - line_number=line_number, - line=content.strip(), - match=match_text, - ) + + # If no matches found and fractional search is enabled, try searching for individual words + if result.stdout.strip() == "" and fractional_search and " " in query: + logger.info(f"No matches found for '{query}'. Trying fractional search.") + + # Split the query into individual words + words = query.split() + + # Create a combined result from searching for each word + combined_results: dict[str, list[SearchMatch]] = {} + original_query = query # Save the original query + + for word in words: + # Skip very short words (optional, can be adjusted) + if len(word) < 3: + continue + + # Update the command with the new word + cmd[-2] = word + + logger.info(f"Running fractional search with word: '{word}'") + word_result = subprocess.run( + cmd, + capture_output=True, + text=True, + encoding="utf-8", + check=False, ) - except ValueError: - # Skip lines with invalid line numbers - continue + + # Parse output lines for this word + for line in word_result.stdout.splitlines(): + # ripgrep output format: file:line:content + parts = line.split(":", 2) + if len(parts) < 3: + continue + + filepath, line_number_str, content = parts + + # Convert to relative path within the codebase + rel_path = os.path.relpath(filepath, codebase.repo_path) + + try: + line_number = int(line_number_str) + + # Create or append to file results + if rel_path not in combined_results: + combined_results[rel_path] = [] + + # Check if we already have this line for this file + # (to avoid duplicates from different word matches) + line_exists = any( + match.line_number == line_number + for match in combined_results.get(rel_path, []) + ) + + if not line_exists: + combined_results[rel_path].append( + SearchMatch( + status="success", + line_number=line_number, + line=content.strip(), + match=word, # Use the individual word as the match + ) + ) + except ValueError: + # Skip lines with invalid line numbers + continue + + # If we found results with fractional search, use them + if combined_results: + all_results = combined_results + # Note: we keep the original query in the results for clarity + + else: + # Parse output lines from the original search + for line in result.stdout.splitlines(): + # ripgrep output format: file:line:content + parts = line.split(":", 2) + if len(parts) < 3: + continue + + filepath, line_number_str, content = parts + + # Convert to relative path within the codebase + rel_path = os.path.relpath(filepath, codebase.repo_path) + + try: + line_number = int(line_number_str) + + # Find the actual match text + match_text = query + if use_regex: + # For regex, we need to find what actually matched + # This is a simplification - ideally we'd use ripgrep's --json option + # to get the exact match positions + pattern = re.compile(query) + match_obj = pattern.search(content) + if match_obj: + match_text = match_obj.group(0) + + # Create or append to file results + if rel_path not in all_results: + all_results[rel_path] = [] + + all_results[rel_path].append( + SearchMatch( + status="success", + line_number=line_number, + line=content.strip(), + match=match_text, + ) + ) + except ValueError: + # Skip lines with invalid line numbers + continue # Convert to SearchFileResult objects file_results = [] @@ -318,10 +395,14 @@ def _search_with_python( page: int = 1, files_per_page: int = 10, use_regex: bool = False, + fractional_search: bool = False, ) -> SearchObservation: """Search the codebase using Python's regex engine. This is a fallback for when ripgrep is not available. + + If fractional_search is True and the initial search returns no results, + it will automatically split the query into individual words and search for each one. """ # Validate pagination parameters if page < 1: @@ -352,6 +433,8 @@ def _search_with_python( extensions = file_extensions if file_extensions is not None else "*" all_results = [] + + # First try with the full query for file in codebase.files(extensions=extensions): # Skip binary files try: @@ -384,6 +467,60 @@ def _search_with_python( matches=sorted(file_matches, key=lambda x: x.line_number), ) ) + + # If no results found and fractional search is enabled, try with individual words + if not all_results and fractional_search and " " in query and not use_regex: + logger.info(f"No matches found for '{query}'. Trying fractional search.") + + # Split the query into individual words + words = query.split() + + # Dictionary to track files and matches to avoid duplicates + combined_results: dict[str, dict[int, SearchMatch]] = {} + + for word in words: + # Skip very short words (optional, can be adjusted) + if len(word) < 3: + continue + + # Create pattern for this word + word_pattern = re.compile(re.escape(word), re.IGNORECASE) + + for file in codebase.files(extensions=extensions): + # Skip binary files + try: + content = file.content + except ValueError: # File is binary + continue + + # Initialize file in combined results if not present + if file.filepath not in combined_results: + combined_results[file.filepath] = {} + + # Split content into lines and store with line numbers (1-based) + lines = enumerate(content.splitlines(), 1) + + # Search each line for the word pattern + for line_number, line in lines: + match = word_pattern.search(line) + if match and line_number not in combined_results[file.filepath]: + combined_results[file.filepath][line_number] = SearchMatch( + status="success", + line_number=line_number, + line=line.strip(), + match=match.group(0), + ) + + # Convert combined results to the expected format + for filepath, matches_dict in combined_results.items(): + if matches_dict: # Only include files with matches + all_results.append( + SearchFileResult( + status="success", + filepath=filepath, + matches=sorted(matches_dict.values(), key=lambda x: x.line_number), + ) + ) # Sort all results by filepath all_results.sort(key=lambda x: x.filepath) @@ -415,6 +552,7 @@ def search( page: int = 1, files_per_page: int = 10, use_regex: bool = False, + fractional_search: bool = False, ) -> SearchObservation: """Search the codebase using text search or regex pattern matching. @@ -432,13 +570,15 @@ def search( page: Page number to return (1-based, default: 1) files_per_page: Number of files to return per page (default: 10) use_regex: Whether to treat query as a regex pattern (default: False) + fractional_search: Whether to automatically search for individual words if the full + query returns no results (default: False) Returns: SearchObservation containing search results with matches and their sources """ # Try to use ripgrep first try: - return _search_with_ripgrep(codebase, query, file_extensions, page, files_per_page, use_regex) + return _search_with_ripgrep(codebase, query, file_extensions, page, files_per_page, use_regex, fractional_search) except (FileNotFoundError, subprocess.SubprocessError): # Fall back to Python implementation if ripgrep fails or isn't available - return _search_with_python(codebase, query, file_extensions, page, files_per_page, use_regex) + return _search_with_python(codebase, query, file_extensions, page, files_per_page, use_regex, fractional_search) From 2a34709f88ca89f7d72f6d8b4e299ebeb1396b87 Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Sat, 22 Mar 2025 03:45:12 +0000 Subject: [PATCH 2/2] Automated pre-commit update --- src/codegen/extensions/tools/search.py | 51 ++++++++++++-------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py index 87cc05853..30c9ab038 100644 --- a/src/codegen/extensions/tools/search.py +++ b/src/codegen/extensions/tools/search.py @@ -180,7 +180,7 @@ def _search_with_ripgrep( """Search the codebase using ripgrep. This is faster than the Python implementation, especially for large codebases. - + If fractional_search is True and the initial search returns no results, it will automatically split the query into individual words and search for each one. """ @@ -234,26 +234,26 @@ def _search_with_ripgrep( files_per_page=files_per_page, results=[], ) - + # If no matches found and fractional search is enabled, try searching for individual words if result.stdout.strip() == "" and fractional_search and " " in query: logger.info(f"No matches found for '{query}'. Trying fractional search.") - + # Split the query into individual words words = query.split() - + # Create a combined result from searching for each word combined_results: dict[str, list[SearchMatch]] = {} original_query = query # Save the original query - + for word in words: # Skip very short words (optional, can be adjusted) if len(word) < 3: continue - + # Update the command with the new word cmd[-2] = word - + logger.info(f"Running fractional search with word: '{word}'") word_result = subprocess.run( cmd, @@ -262,7 +262,7 @@ def _search_with_ripgrep( encoding="utf-8", check=False, ) - + # Parse output lines for this word for line in word_result.stdout.splitlines(): # ripgrep output format: file:line:content @@ -284,11 +284,8 @@ def _search_with_ripgrep( # Check if we already have this line for this file # (to avoid duplicates from different word matches) - line_exists = any( - match.line_number == line_number - for match in combined_results.get(rel_path, []) - ) - + line_exists = any(match.line_number == line_number for match in combined_results.get(rel_path, [])) + if not line_exists: combined_results[rel_path].append( SearchMatch( @@ -301,12 +298,12 @@ def _search_with_ripgrep( except ValueError: # Skip lines with invalid line numbers continue - + # If we found results with fractional search, use them if combined_results: all_results = combined_results # Note: we keep the original query in the results for clarity - + else: # Parse output lines from the original search for line in result.stdout.splitlines(): @@ -400,7 +397,7 @@ def _search_with_python( """Search the codebase using Python's regex engine. This is a fallback for when ripgrep is not available. - + If fractional_search is True and the initial search returns no results, it will automatically split the query into individual words and search for each one. """ @@ -433,7 +430,7 @@ def _search_with_python( extensions = file_extensions if file_extensions is not None else "*" all_results = [] - + # First try with the full query for file in codebase.files(extensions=extensions): # Skip binary files @@ -467,39 +464,39 @@ def _search_with_python( matches=sorted(file_matches, key=lambda x: x.line_number), ) ) - + # If no results found and fractional search is enabled, try with individual words if not all_results and fractional_search and " " in query and not use_regex: logger.info(f"No matches found for '{query}'. Trying fractional search.") - + # Split the query into individual words words = query.split() - + # Dictionary to track files and matches to avoid duplicates combined_results: dict[str, dict[int, SearchMatch]] = {} - + for word in words: # Skip very short words (optional, can be adjusted) if len(word) < 3: continue - + # Create pattern for this word word_pattern = re.compile(re.escape(word), re.IGNORECASE) - + for file in codebase.files(extensions=extensions): # Skip binary files try: content = file.content except ValueError: # File is binary continue - + # Initialize file in combined results if not present if file.filepath not in combined_results: combined_results[file.filepath] = {} - + # Split content into lines and store with line numbers (1-based) lines = enumerate(content.splitlines(), 1) - + # Search each line for the word pattern for line_number, line in lines: match = word_pattern.search(line) @@ -510,7 +507,7 @@ def _search_with_python( line=line.strip(), match=match.group(0), ) - + # Convert combined results to the expected format for filepath, matches_dict in combined_results.items(): if matches_dict: # Only include files with matches