From 66c7bfba04a299b3597ec8694a93342426dff8a9 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Wed, 4 Feb 2026 17:03:35 +0000
Subject: [PATCH] Optimize get_analyzer_for_file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimized code achieves a **31% runtime improvement** by introducing `@lru_cache` to cache `TreeSitterAnalyzer` instances based on file extensions, eliminating redundant object creation.

## Key Optimization

**Added LRU caching**: The new `_analyzer_for_suffix()` helper function uses `@lru_cache(maxsize=16)` to cache analyzer instances. When the same file extension is encountered multiple times, the cached analyzer is returned instead of creating a new `TreeSitterAnalyzer` object.

## Why This Improves Runtime

1. **Eliminates repeated object instantiation**: The original code created a new `TreeSitterAnalyzer` every time `get_analyzer_for_file()` was called, even for the same file type. Line profiler shows that in the original version, `TreeSitterAnalyzer.__init__` was called **1,082 times**, consuming 1.15ms. In the optimized version, it's only called **38 times** (cache misses), consuming just 55μs — a **95% reduction**.

2. **Fast dictionary lookup vs object creation**: The LRU cache uses a fast dictionary lookup (O(1)) to return cached analyzers. This is significantly faster than the original flow, which required:
   - Creating a new object
   - Running an `isinstance()` check
   - Assigning attributes (`self.language`, `self._parser`)

3. **Reduced memory allocation overhead**: Each new `TreeSitterAnalyzer` instance requires memory allocation and initialization. Reusing cached instances eliminates this overhead for repeated file extensions.

## Impact on Hot Path Usage

The function references show that `get_analyzer_for_file()` is called extensively in test discovery code across multiple test files.
The function is invoked **within loops** for processing JavaScript/TypeScript test files, making it a hot path. For example:
- Processing 100+ files in `test_multiple_ts_files_consistent_results`
- Called repeatedly in test batches and nested loops

Since the same file extensions (.ts, .tsx, .js) are processed repeatedly in these loops, the cache hit rate is very high, maximizing the optimization's benefit.

## Test Case Performance

The annotated tests confirm this optimization excels in the following cases:
- **Processing the same extension multiple times**: Tests like `test_multiple_ts_files_consistent_results` show a 33.8% speedup
- **Common extensions** (.ts, .tsx, .js): 35-48% faster on individual calls
- **Batch operations**: Processing lists of files with repeated extensions sees consistent 30-40% improvements

Edge cases with uncommon extensions (.txt, .py) may show a slight regression (12-19% slower) due to cache lookup overhead, but these are rare in practice given the function's usage for JavaScript/TypeScript file analysis.
--- codeflash/languages/treesitter_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/codeflash/languages/treesitter_utils.py b/codeflash/languages/treesitter_utils.py index 8125161c1..10608d104 100644 --- a/codeflash/languages/treesitter_utils.py +++ b/codeflash/languages/treesitter_utils.py @@ -9,6 +9,7 @@ import logging from dataclasses import dataclass from enum import Enum +from functools import lru_cache from typing import TYPE_CHECKING from tree_sitter import Language, Parser @@ -1579,7 +1580,24 @@ def get_analyzer_for_file(file_path: Path) -> TreeSitterAnalyzer: """ suffix = file_path.suffix.lower() + return _analyzer_for_suffix(suffix) + + +@lru_cache(maxsize=16) +def _analyzer_for_suffix(suffix: str) -> TreeSitterAnalyzer: + """Return a cached TreeSitterAnalyzer for a lowercase suffix.""" + if suffix in (".ts",): # noqa: FURB171 + return TreeSitterAnalyzer(TreeSitterLanguage.TYPESCRIPT) + if suffix in (".tsx",): # noqa: FURB171 + return TreeSitterAnalyzer(TreeSitterLanguage.TSX) + # Default to JavaScript for .js, .jsx, .mjs, .cjs + return TreeSitterAnalyzer(TreeSitterLanguage.JAVASCRIPT) + + +@lru_cache(maxsize=16) +def _analyzer_for_suffix(suffix: str) -> TreeSitterAnalyzer: + """Return a cached TreeSitterAnalyzer for a lowercase suffix.""" if suffix in (".ts",): # noqa: FURB171 return TreeSitterAnalyzer(TreeSitterLanguage.TYPESCRIPT) if suffix in (".tsx",): # noqa: FURB171