From 55abfd8b59a708c59fc84676385b75b5a9d5212d Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 12:34:43 +0000 Subject: [PATCH 01/16] ZAM-368: Add diff_lite.py implementation to analyzers directory --- .../codegen_on_oss/analyzers/__init__.py | 9 +- .../codegen_on_oss/analyzers/diff_lite.py | 162 ++++++++++++++++++ .../tests/unit/analyzers/test_diff_lite.py | 131 ++++++++++++++ 3 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 codegen-on-oss/codegen_on_oss/analyzers/diff_lite.py create mode 100644 codegen-on-oss/tests/unit/analyzers/test_diff_lite.py diff --git a/codegen-on-oss/codegen_on_oss/analyzers/__init__.py b/codegen-on-oss/codegen_on_oss/analyzers/__init__.py index f1ef5c5b4..80f1eab6d 100644 --- a/codegen-on-oss/codegen_on_oss/analyzers/__init__.py +++ b/codegen-on-oss/codegen_on_oss/analyzers/__init__.py @@ -47,6 +47,9 @@ from codegen_on_oss.analyzers.code_quality import CodeQualityAnalyzer from codegen_on_oss.analyzers.dependencies import DependencyAnalyzer +# Diff tracking +from codegen_on_oss.analyzers.diff_lite import ChangeType, DiffLite + # Legacy analyzer interfaces (for backward compatibility) from codegen_on_oss.analyzers.base_analyzer import BaseCodeAnalyzer from codegen_on_oss.analyzers.codebase_analyzer import CodebaseAnalyzer @@ -85,9 +88,13 @@ # Core analyzers 'CodeQualityAnalyzer', 'DependencyAnalyzer', + + # Diff tracking + 'ChangeType', + 'DiffLite', # Legacy interfaces (for backward compatibility) 'BaseCodeAnalyzer', 'CodebaseAnalyzer', 'ErrorAnalyzer', -] \ No newline at end of file +] diff --git a/codegen-on-oss/codegen_on_oss/analyzers/diff_lite.py b/codegen-on-oss/codegen_on_oss/analyzers/diff_lite.py new file mode 100644 index 000000000..59ba4cf05 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/analyzers/diff_lite.py @@ -0,0 +1,162 @@ +from enum import IntEnum, auto +from os import PathLike +from pathlib import Path +from typing import NamedTuple, Self + +from git import Diff +from watchfiles import Change + + +class ChangeType(IntEnum): + """ + Enumeration of change types for tracking file modifications. + + Attributes: + Modified: File content has been modified + Removed: File has been deleted + Renamed: File has been renamed + Added: New file has been added + """ + Modified = auto() + Removed = auto() + Renamed = auto() + Added = auto() + + @staticmethod + def from_watch_change_type(change_type: Change) -> 'ChangeType': + """ + Convert watchfiles Change type to ChangeType. + + Args: + change_type: The watchfiles Change enum value + + Returns: + Corresponding ChangeType enum value + """ + if change_type is Change.added: + return ChangeType.Added + elif change_type is Change.deleted: + return ChangeType.Removed + elif change_type is Change.modified: + return ChangeType.Modified + + msg = f"Unsupported watch change type: {change_type}" + raise ValueError(msg) + + @staticmethod + def from_git_change_type(change_type: str | None) -> 'ChangeType': + """ + Convert git change type string to ChangeType. + + Args: + change_type: Git change type string ('M', 'D', 'R', 'A') + + Returns: + Corresponding ChangeType enum value + + Raises: + ValueError: If the change type is not supported + """ + if change_type == "M": + return ChangeType.Modified + if change_type == "D": + return ChangeType.Removed + if change_type == "R": + return ChangeType.Renamed + if change_type == "A": + return ChangeType.Added + + msg = f"Invalid git change type: {change_type}" + raise ValueError(msg) + + +class DiffLite(NamedTuple): + """ + Simple diff implementation for tracking file changes during code analysis. + + This lightweight diff implementation provides support for tracking file changes, + including modifications, removals, renames, and additions. + + Attributes: + change_type: Type of change (Modified, Removed, Renamed, Added) + path: Path to the file + rename_from: Original path for renamed files (None for non-renamed files) + rename_to: New path for renamed files (None for non-renamed files) + old_content: Previous content of the file (None if not available) + """ + change_type: ChangeType + path: Path + rename_from: Path | None = None + rename_to: Path | None = None + old_content: bytes | None = None + + @classmethod + def from_watch_change(cls, change: Change, path: PathLike) -> Self: + """ + Create a DiffLite instance from a watchfiles Change. + + Args: + change: The watchfiles Change enum value + path: Path to the file + + Returns: + DiffLite instance representing the change + """ + return cls( + change_type=ChangeType.from_watch_change_type(change), + path=Path(path), + ) + + @classmethod + def from_git_diff(cls, git_diff: Diff) -> Self: + """ + Create a DiffLite instance from a git Diff object. + + Args: + git_diff: Git Diff object + + Returns: + DiffLite instance representing the git diff + """ + old = None + if git_diff.a_blob: + old = git_diff.a_blob.data_stream.read() + + return cls( + change_type=ChangeType.from_git_change_type(git_diff.change_type), + path=Path(git_diff.a_path) if git_diff.a_path else None, + rename_from=Path(git_diff.rename_from) if git_diff.rename_from else None, + rename_to=Path(git_diff.rename_to) if git_diff.rename_to else None, + old_content=old, + ) + + @classmethod + def from_reverse_diff(cls, diff_lite: "DiffLite") -> Self: + """ + Create a DiffLite instance that represents the reverse of another DiffLite. + + This is useful for undoing changes or representing the opposite operation. + + Args: + diff_lite: Original DiffLite instance + + Returns: + DiffLite instance representing the reverse change + """ + if diff_lite.change_type == ChangeType.Added: + change_type = ChangeType.Removed + elif diff_lite.change_type == ChangeType.Removed: + change_type = ChangeType.Added + else: + change_type = diff_lite.change_type + + if diff_lite.change_type == ChangeType.Renamed: + return cls( + change_type=change_type, + path=diff_lite.path, + rename_from=diff_lite.rename_to, + rename_to=diff_lite.rename_from, + ) + + return cls(change_type=change_type, path=diff_lite.path) + diff --git a/codegen-on-oss/tests/unit/analyzers/test_diff_lite.py b/codegen-on-oss/tests/unit/analyzers/test_diff_lite.py new file mode 100644 index 000000000..537cedd9f --- /dev/null +++ b/codegen-on-oss/tests/unit/analyzers/test_diff_lite.py @@ -0,0 +1,131 @@ +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from git import Diff +from watchfiles import Change + +from codegen_on_oss.analyzers.diff_lite import ChangeType, DiffLite + + +class TestChangeType(unittest.TestCase): + def test_from_watch_change_type_added(self): + self.assertEqual(ChangeType.from_watch_change_type(Change.added), ChangeType.Added) + + def test_from_watch_change_type_deleted(self): + self.assertEqual(ChangeType.from_watch_change_type(Change.deleted), ChangeType.Removed) + + def test_from_watch_change_type_modified(self): + self.assertEqual(ChangeType.from_watch_change_type(Change.modified), ChangeType.Modified) + + def test_from_watch_change_type_invalid(self): + # Create a mock Change that doesn't match any of the expected values + invalid_change = MagicMock() + with self.assertRaises(ValueError): + ChangeType.from_watch_change_type(invalid_change) + + def test_from_git_change_type_modified(self): + self.assertEqual(ChangeType.from_git_change_type("M"), ChangeType.Modified) + + def test_from_git_change_type_removed(self): + self.assertEqual(ChangeType.from_git_change_type("D"), ChangeType.Removed) + + def test_from_git_change_type_renamed(self): + self.assertEqual(ChangeType.from_git_change_type("R"), ChangeType.Renamed) + + def test_from_git_change_type_added(self): + self.assertEqual(ChangeType.from_git_change_type("A"), ChangeType.Added) + + def test_from_git_change_type_invalid(self): + with self.assertRaises(ValueError): + ChangeType.from_git_change_type("X") + + +class TestDiffLite(unittest.TestCase): + def test_from_watch_change(self): + path = "test/path.py" + diff = DiffLite.from_watch_change(Change.added, path) + + self.assertEqual(diff.change_type, ChangeType.Added) + self.assertEqual(diff.path, Path(path)) + self.assertIsNone(diff.rename_from) + self.assertIsNone(diff.rename_to) + self.assertIsNone(diff.old_content) + + @patch('git.Diff') + def test_from_git_diff_modified(self, mock_diff): + mock_diff.change_type = "M" + mock_diff.a_path = "test/path.py" + mock_diff.rename_from = None + mock_diff.rename_to = None + + # Mock the blob and data stream + mock_blob = MagicMock() + mock_blob.data_stream.read.return_value = b"old content" + mock_diff.a_blob = mock_blob + + diff = DiffLite.from_git_diff(mock_diff) + + self.assertEqual(diff.change_type, ChangeType.Modified) + self.assertEqual(diff.path, Path("test/path.py")) + self.assertIsNone(diff.rename_from) + self.assertIsNone(diff.rename_to) + self.assertEqual(diff.old_content, b"old content") + + @patch('git.Diff') + def test_from_git_diff_renamed(self, mock_diff): + mock_diff.change_type = "R" + mock_diff.a_path = "test/old_path.py" + mock_diff.rename_from = "test/old_path.py" + mock_diff.rename_to = "test/new_path.py" + mock_diff.a_blob = None + + diff = DiffLite.from_git_diff(mock_diff) + + self.assertEqual(diff.change_type, ChangeType.Renamed) + self.assertEqual(diff.path, Path("test/old_path.py")) + self.assertEqual(diff.rename_from, Path("test/old_path.py")) + self.assertEqual(diff.rename_to, Path("test/new_path.py")) + self.assertIsNone(diff.old_content) + + def test_from_reverse_diff_added_to_removed(self): + original = DiffLite( + change_type=ChangeType.Added, + path=Path("test/path.py") + ) + + reversed_diff = DiffLite.from_reverse_diff(original) + + self.assertEqual(reversed_diff.change_type, ChangeType.Removed) + self.assertEqual(reversed_diff.path, Path("test/path.py")) + + def test_from_reverse_diff_removed_to_added(self): + original = DiffLite( + change_type=ChangeType.Removed, + path=Path("test/path.py") + ) + + reversed_diff = DiffLite.from_reverse_diff(original) + + self.assertEqual(reversed_diff.change_type, ChangeType.Added) + self.assertEqual(reversed_diff.path, Path("test/path.py")) + + def test_from_reverse_diff_renamed(self): + original = DiffLite( + change_type=ChangeType.Renamed, + path=Path("test/old_path.py"), + rename_from=Path("test/old_path.py"), + rename_to=Path("test/new_path.py") + ) + + reversed_diff = DiffLite.from_reverse_diff(original) + + self.assertEqual(reversed_diff.change_type, ChangeType.Renamed) + self.assertEqual(reversed_diff.path, Path("test/old_path.py")) + self.assertEqual(reversed_diff.rename_from, Path("test/new_path.py")) + self.assertEqual(reversed_diff.rename_to, Path("test/old_path.py")) + + +if __name__ == "__main__": + unittest.main() + From dee116ba90d092d5c5d5ecae4d8d64ab5d9e002f Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 12:39:43 +0000 Subject: [PATCH 02/16] Fix: Replace dateutil.parser with datetime's native parsing --- .../client/openapi_client/api_client.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/codegen/agents/client/openapi_client/api_client.py b/src/codegen/agents/client/openapi_client/api_client.py index 02e3fdb6c..6db3434f4 100644 --- a/src/codegen/agents/client/openapi_client/api_client.py +++ b/src/codegen/agents/client/openapi_client/api_client.py @@ -16,11 +16,11 @@ import re import tempfile from enum import Enum -from typing import Optional, Union from urllib.parse import quote +from multiprocessing.pool import ThreadPool -from dateutil.parser import parse -from pydantic import SecretStr +# Import for date parsing +from datetime import datetime as dt import codegen.agents.client.openapi_client as openapi_client from codegen.agents.client.openapi_client import rest @@ -29,7 +29,7 @@ from codegen.agents.client.openapi_client.configuration import Configuration from codegen.agents.client.openapi_client.exceptions import ApiException, ApiValueError -RequestSerialized = tuple[str, str, dict[str, str], Optional[str], list[str]] +RequestSerialized = tuple[str, str, dict[str, str], str | None, list[str]] class ApiClient: @@ -42,8 +42,8 @@ class ApiClient: :param configuration: .Configuration object for this client :param header_name: a header to pass when making calls to the API. - :param header_value: a header value to pass when making calls to - the API. + :param header_value: a header value to pass when making calls + to the API. :param cookie: a cookie to include in the header when making calls to the API """ @@ -227,7 +227,7 @@ def call_api(self, method, url, header_params=None, body=None, post_params=None, return response_data - def response_deserialize(self, response_data: rest.RESTResponse, response_types_map: Optional[dict[str, ApiResponseT]] = None) -> ApiResponse[ApiResponseT]: + def response_deserialize(self, response_data: rest.RESTResponse, response_types_map: dict[str, ApiResponseT] | None = None) -> ApiResponse[ApiResponseT]: """Deserializes response into an object. :param response_data: RESTResponse object to be deserialized. :param response_types_map: dict of response types. @@ -295,7 +295,7 @@ def sanitize_for_serialization(self, obj): return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] elif isinstance(obj, tuple): return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) - elif isinstance(obj, (datetime.datetime, datetime.date)): + elif isinstance(obj, datetime.datetime | datetime.date): return obj.isoformat() elif isinstance(obj, decimal.Decimal): return str(obj) @@ -315,7 +315,7 @@ def sanitize_for_serialization(self, obj): return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} - def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): + def deserialize(self, response_text: str, response_type: str, content_type: str | None): """Deserializes response into an object. :param response: RESTResponse object to be deserialized. @@ -430,7 +430,7 @@ def parameters_to_url_query(self, params, collection_formats): for k, v in params.items() if isinstance(params, dict) else params: if isinstance(v, bool): v = str(v).lower() - if isinstance(v, (int, float)): + if isinstance(v, int | float): v = str(v) if isinstance(v, dict): v = json.dumps(v) @@ -456,7 +456,7 @@ def parameters_to_url_query(self, params, collection_formats): def files_parameters( self, - files: dict[str, Union[str, bytes, list[str], list[bytes], tuple[str, bytes]]], + files: dict[str, str | bytes | list[str] | list[bytes] | tuple[str, bytes]], ): """Builds form parameters. @@ -485,7 +485,7 @@ def files_parameters( params.append(tuple([k, tuple([filename, filedata, mimetype])])) return params - def select_header_accept(self, accepts: list[str]) -> Optional[str]: + def select_header_accept(self, accepts: list[str]) -> str | None: """Returns `Accept` based on an array of accepts provided. :param accepts: List of headers. @@ -618,7 +618,8 @@ def __deserialize_date(self, string): :return: date. """ try: - return parse(string).date() + # Use datetime's own parsing instead of dateutil + return dt.fromisoformat(string.replace('Z', '+00:00')).date() except ImportError: return string except ValueError: @@ -633,7 +634,8 @@ def __deserialize_datetime(self, string): :return: datetime. """ try: - return parse(string) + # Use datetime's own parsing instead of dateutil + return dt.fromisoformat(string.replace('Z', '+00:00')) except ImportError: return string except ValueError: From e333dab09a5018fbca1b5faa4e65e8ee5b74b749 Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 12:39:47 +0000 Subject: [PATCH 03/16] Fix: Apply ruff formatting and linting fixes --- .../swebench_agent_run/local_run.ipynb | 16 +- .../codegen_on_oss/analyzers/__init__.py | 123 +- .../codegen_on_oss/analyzers/analyzer.py | 686 +++--- .../analyzers/analyzer_manager.py | 436 ++-- .../codegen_on_oss/analyzers/api.py | 416 ++-- .../codegen_on_oss/analyzers/base_analyzer.py | 216 +- .../codegen_on_oss/analyzers/code_quality.py | 1253 ++++++----- .../analyzers/code_quality_analyzer.py | 591 +++--- .../analyzers/codebase_analyzer.py | 1860 ++++++++++------- .../analyzers/codebase_context.py | 484 +++-- .../analyzers/codebase_visualizer.py | 1185 ++++++----- .../analyzers/context/__init__.py | 8 +- .../analyzers/context/codebase.py | 332 +-- .../codegen_on_oss/analyzers/context/file.py | 259 +-- .../analyzers/context/function.py | 257 ++- .../analyzers/context/graph/__init__.py | 106 +- .../analyzers/context_codebase.py | 553 ++--- .../analyzers/current_code_codebase.py | 153 +- .../codegen_on_oss/analyzers/dependencies.py | 884 ++++---- .../analyzers/dependency_analyzer.py | 460 ++-- .../codegen_on_oss/analyzers/diff_lite.py | 43 +- .../analyzers/error_analyzer.py | 325 +-- .../analyzers/issue_analyzer.py | 149 +- .../codegen_on_oss/analyzers/issue_types.py | 34 +- .../codegen_on_oss/analyzers/issues.py | 333 +-- .../analyzers/models/analysis_result.py | 267 ++- .../resolution/resolution_manager.py | 439 ++-- .../analyzers/snapshot/snapshot_manager.py | 456 ++-- .../analyzers/unified_analyzer.py | 1416 +++++++------ .../visualization/analysis_visualizer.py | 406 ++-- .../visualization/code_visualizer.py | 398 ++-- .../visualization/codebase_visualizer.py | 307 ++- .../analyzers/visualization/visualizer.py | 223 +- .../codegen_on_oss/error_analyzer.py | 1402 +++++++------ .../tests/unit/analyzers/test_diff_lite.py | 54 +- organize_codebase.py | 145 +- organize_specific_codebase.py | 98 +- organize_with_codegen_sdk.py | 126 +- src/codegen/agents/agent.py | 6 +- src/codegen/agents/chat_agent.py | 8 +- .../client/openapi_client/api/agents_api.py | 254 +-- .../openapi_client/api/organizations_api.py | 164 +- .../client/openapi_client/api/users_api.py | 290 +-- .../client/openapi_client/configuration.py | 57 +- .../client/openapi_client/exceptions.py | 12 +- .../models/agent_run_response.py | 3 +- .../models/create_agent_run_input.py | 3 +- .../models/http_validation_error.py | 3 +- .../models/organization_response.py | 3 +- .../models/organization_settings.py | 3 +- .../models/page_organization_response.py | 3 +- .../models/page_user_response.py | 3 +- .../openapi_client/models/user_response.py | 3 +- .../openapi_client/models/validation_error.py | 3 +- .../models/validation_error_loc_inner.py | 3 +- .../agents/client/openapi_client/rest.py | 2 +- src/codegen/agents/code_agent.py | 18 +- src/codegen/agents/data.py | 18 +- src/codegen/agents/tracer.py | 6 +- src/codegen/cli/commands/serve/main.py | 3 +- .../extensions/attribution/git_history.py | 7 +- src/codegen/extensions/attribution/main.py | 5 +- src/codegen/extensions/events/codegen_app.py | 4 +- src/codegen/extensions/events/github.py | 3 +- src/codegen/extensions/events/github_types.py | 9 +- src/codegen/extensions/events/linear.py | 3 +- .../extensions/github/types/pull_request.py | 24 +- src/codegen/extensions/github/types/push.py | 3 +- src/codegen/extensions/graph/create_graph.py | 3 +- .../extensions/graph/neo4j_exporter.py | 4 +- src/codegen/extensions/index/code_index.py | 12 +- src/codegen/extensions/index/file_index.py | 3 +- src/codegen/extensions/langchain/graph.py | 10 +- src/codegen/extensions/langchain/llm.py | 12 +- src/codegen/extensions/langchain/tools.py | 20 +- .../langchain/utils/custom_tool_node.py | 10 +- .../langchain/utils/get_langsmith_url.py | 5 +- .../extensions/linear/linear_client.py | 3 +- src/codegen/extensions/lsp/definition.py | 2 +- src/codegen/extensions/lsp/execute.py | 3 +- src/codegen/extensions/lsp/server.py | 8 +- src/codegen/extensions/mcp/codebase_tools.py | 14 +- src/codegen/extensions/swebench/utils.py | 10 +- src/codegen/extensions/tools/bash.py | 8 +- src/codegen/extensions/tools/create_file.py | 4 +- src/codegen/extensions/tools/edit_file.py | 4 +- .../tools/github/create_pr_review_comment.py | 4 +- .../extensions/tools/link_annotation.py | 2 +- src/codegen/extensions/tools/observation.py | 6 +- src/codegen/extensions/tools/reflection.py | 10 +- .../extensions/tools/replacement_edit.py | 14 +- src/codegen/extensions/tools/reveal_symbol.py | 28 +- .../extensions/tools/search_files_by_name.py | 9 +- src/codegen/extensions/tools/semantic_edit.py | 8 +- .../extensions/tools/semantic_search.py | 4 +- .../extensions/tools/tool_output_types.py | 56 +- src/codegen/extensions/tools/view_file.py | 16 +- src/codegen/runner/sandbox/middlewares.py | 3 +- src/codegen/sdk/codebase/multigraph.py | 5 +- src/codegen/sdk/core/codeowner.py | 4 +- src/codegen/sdk/core/utils/cache_utils.py | 4 +- src/codegen/sdk/types.py | 3 +- .../shared/compilation/exception_utils.py | 9 +- tests/unit/codegen/agents/test_api_client.py | 2 +- .../extensions/lsp/test_document_symbols.py | 8 +- .../extensions/lsp/test_workspace_sync.py | 2 +- 106 files changed, 10084 insertions(+), 8073 deletions(-) diff --git a/codegen-examples/examples/swebench_agent_run/local_run.ipynb b/codegen-examples/examples/swebench_agent_run/local_run.ipynb index f2f73c922..237732bbf 100644 --- a/codegen-examples/examples/swebench_agent_run/local_run.ipynb +++ b/codegen-examples/examples/swebench_agent_run/local_run.ipynb @@ -32,7 +32,14 @@ "metadata": {}, "outputs": [], "source": [ - "await run_eval(use_existing_preds=None, dataset=\"lite\", length=5, repo=\"django/django\", num_workers=10, model=\"claude-3-7-sonnet-latest\")" + "await run_eval(\n", + " use_existing_preds=None,\n", + " dataset=\"lite\",\n", + " length=5,\n", + " repo=\"django/django\",\n", + " num_workers=10,\n", + " model=\"claude-3-7-sonnet-latest\",\n", + ")" ] }, { @@ -76,7 +83,12 @@ "source": [ "from codegen.agents.code_agent import CodeAgent\n", "\n", - "agent = CodeAgent(codebase=codebase, tags=[\"local_test\"], model_name=\"claude-3-5-sonnet-latest\", model_provider=\"anthropic\")" + "agent = CodeAgent(\n", + " codebase=codebase,\n", + " tags=[\"local_test\"],\n", + " model_name=\"claude-3-5-sonnet-latest\",\n", + " model_provider=\"anthropic\",\n", + ")" ] }, { diff --git a/codegen-on-oss/codegen_on_oss/analyzers/__init__.py b/codegen-on-oss/codegen_on_oss/analyzers/__init__.py index 80f1eab6d..5d4a9394f 100644 --- a/codegen-on-oss/codegen_on_oss/analyzers/__init__.py +++ b/codegen-on-oss/codegen_on_oss/analyzers/__init__.py @@ -7,32 +7,43 @@ """ # Main API interface -from codegen_on_oss.analyzers.api import ( - CodegenAnalyzerAPI, - create_api, - api_analyze_codebase, - api_analyze_pr, - api_get_visualization, - api_get_static_errors -) - # Modern analyzer architecture from codegen_on_oss.analyzers.analyzer import ( AnalyzerManager, AnalyzerPlugin, AnalyzerRegistry, CodeQualityPlugin, - DependencyPlugin + DependencyPlugin, +) +from codegen_on_oss.analyzers.api import ( + CodegenAnalyzerAPI, + api_analyze_codebase, + api_analyze_pr, + api_get_static_errors, + api_get_visualization, + create_api, ) +# Legacy analyzer interfaces (for backward compatibility) +from codegen_on_oss.analyzers.base_analyzer import BaseCodeAnalyzer + +# Core analysis modules +from codegen_on_oss.analyzers.code_quality import CodeQualityAnalyzer +from codegen_on_oss.analyzers.codebase_analyzer import CodebaseAnalyzer +from codegen_on_oss.analyzers.dependencies import DependencyAnalyzer + +# Diff tracking +from codegen_on_oss.analyzers.diff_lite import ChangeType, DiffLite +from codegen_on_oss.analyzers.error_analyzer import CodebaseAnalyzer as ErrorAnalyzer + # Issue tracking system from codegen_on_oss.analyzers.issues import ( + AnalysisType, + CodeLocation, Issue, + IssueCategory, IssueCollection, IssueSeverity, - AnalysisType, - IssueCategory, - CodeLocation ) # Analysis result models @@ -40,61 +51,43 @@ AnalysisResult, CodeQualityResult, DependencyResult, - PrAnalysisResult + PrAnalysisResult, ) -# Core analysis modules -from codegen_on_oss.analyzers.code_quality import CodeQualityAnalyzer -from codegen_on_oss.analyzers.dependencies import DependencyAnalyzer - -# Diff tracking -from codegen_on_oss.analyzers.diff_lite import ChangeType, DiffLite - -# Legacy analyzer interfaces (for backward compatibility) -from codegen_on_oss.analyzers.base_analyzer import BaseCodeAnalyzer -from codegen_on_oss.analyzers.codebase_analyzer import CodebaseAnalyzer -from codegen_on_oss.analyzers.error_analyzer import CodebaseAnalyzer as ErrorAnalyzer - __all__ = [ - # Main API - 'CodegenAnalyzerAPI', - 'create_api', - 'api_analyze_codebase', - 'api_analyze_pr', - 'api_get_visualization', - 'api_get_static_errors', - - # Modern architecture - 'AnalyzerManager', - 'AnalyzerPlugin', - 'AnalyzerRegistry', - 'CodeQualityPlugin', - 'DependencyPlugin', - - # Issue tracking - 'Issue', - 'IssueCollection', - 'IssueSeverity', - 'AnalysisType', - 'IssueCategory', - 'CodeLocation', - # Analysis results - 'AnalysisResult', - 'CodeQualityResult', - 'DependencyResult', - 'PrAnalysisResult', - - # Core analyzers - 'CodeQualityAnalyzer', - 'DependencyAnalyzer', - - # Diff tracking - 'ChangeType', - 'DiffLite', - + "AnalysisResult", + "AnalysisType", + # Modern architecture + "AnalyzerManager", + "AnalyzerPlugin", + "AnalyzerRegistry", # Legacy interfaces (for backward compatibility) - 'BaseCodeAnalyzer', - 'CodebaseAnalyzer', - 'ErrorAnalyzer', + "BaseCodeAnalyzer", + # Diff tracking + "ChangeType", + "CodeLocation", + # Core analyzers + "CodeQualityAnalyzer", + "CodeQualityPlugin", + "CodeQualityResult", + "CodebaseAnalyzer", + # Main API + "CodegenAnalyzerAPI", + "DependencyAnalyzer", + "DependencyPlugin", + "DependencyResult", + "DiffLite", + "ErrorAnalyzer", + # Issue tracking + "Issue", + "IssueCategory", + "IssueCollection", + "IssueSeverity", + "PrAnalysisResult", + "api_analyze_codebase", + "api_analyze_pr", + "api_get_static_errors", + "api_get_visualization", + "create_api", ] diff --git a/codegen-on-oss/codegen_on_oss/analyzers/analyzer.py b/codegen-on-oss/codegen_on_oss/analyzers/analyzer.py index 4337bba5b..55963544e 100644 --- a/codegen-on-oss/codegen_on_oss/analyzers/analyzer.py +++ b/codegen-on-oss/codegen_on_oss/analyzers/analyzer.py @@ -7,35 +7,37 @@ It serves as the primary API entry point for the analyzer backend. """ -import os -import sys import json import logging +import sys from datetime import datetime -from pathlib import Path -from typing import Dict, List, Set, Tuple, Any, Optional, Union, Type, Callable -from enum import Enum +from typing import Any try: - from codegen.sdk.core.codebase import Codebase from codegen.configs.models.codebase import CodebaseConfig from codegen.configs.models.secrets import SecretsConfig - from codegen.sdk.codebase.config import ProjectConfig - from codegen.git.schemas.repo_config import RepoConfig from codegen.git.repo_operator.repo_operator import RepoOperator + from codegen.git.schemas.repo_config import RepoConfig + from codegen.sdk.codebase.config import ProjectConfig + from codegen.sdk.core.codebase import Codebase from codegen.shared.enums.programming_language import ProgrammingLanguage except ImportError: print("Codegen SDK not found. Please install it first.") sys.exit(1) # Import internal modules - these will be replaced with actual imports once implemented -from codegen_on_oss.analyzers.issues import Issue, IssueSeverity, AnalysisType, IssueCategory +from codegen_on_oss.analyzers.issues import ( + AnalysisType, + Issue, + IssueCategory, + IssueSeverity, +) # Configure logging logging.basicConfig( level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler()] + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], ) logger = logging.getLogger(__name__) @@ -56,108 +58,115 @@ ".vscode", ] + class AnalyzerRegistry: """Registry of analyzer plugins.""" - + _instance = None - + def __new__(cls): if cls._instance is None: - cls._instance = super(AnalyzerRegistry, cls).__new__(cls) + cls._instance = super().__new__(cls) cls._instance._analyzers = {} return cls._instance - - def register(self, analysis_type: AnalysisType, analyzer_class: Type['AnalyzerPlugin']): + + def register( + self, analysis_type: AnalysisType, analyzer_class: type["AnalyzerPlugin"] + ): """Register an analyzer plugin.""" self._analyzers[analysis_type] = analyzer_class - - def get_analyzer(self, analysis_type: AnalysisType) -> Optional[Type['AnalyzerPlugin']]: + + def get_analyzer( + self, analysis_type: AnalysisType + ) -> type["AnalyzerPlugin"] | None: """Get the analyzer plugin for a specific analysis type.""" return self._analyzers.get(analysis_type) - - def list_analyzers(self) -> Dict[AnalysisType, Type['AnalyzerPlugin']]: + + def list_analyzers(self) -> dict[AnalysisType, type["AnalyzerPlugin"]]: """Get all registered analyzers.""" return self._analyzers.copy() + class AnalyzerPlugin: """Base class for analyzer plugins.""" - - def __init__(self, manager: 'AnalyzerManager'): + + def __init__(self, manager: "AnalyzerManager"): """Initialize the analyzer plugin.""" self.manager = manager self.issues = [] - - def analyze(self) -> Dict[str, Any]: + + def analyze(self) -> dict[str, Any]: """Perform analysis using this plugin.""" raise NotImplementedError("Analyzer plugins must implement analyze()") - + def add_issue(self, issue: Issue): """Add an issue to the list.""" self.manager.add_issue(issue) self.issues.append(issue) + class CodeQualityPlugin(AnalyzerPlugin): """Plugin for code quality analysis.""" - - def analyze(self) -> Dict[str, Any]: + + def analyze(self) -> dict[str, Any]: """Perform code quality analysis.""" # This is a simplified placeholder - would import and use code_quality.py result = { "dead_code": self._find_dead_code(), "complexity": self._analyze_complexity(), "maintainability": self._analyze_maintainability(), - "style_issues": self._analyze_style_issues() + "style_issues": self._analyze_style_issues(), } return result - - def _find_dead_code(self) -> Dict[str, Any]: + + def _find_dead_code(self) -> dict[str, Any]: """Find unused code in the codebase.""" # This is a placeholder return {"unused_functions": [], "unused_classes": [], "unused_variables": []} - - def _analyze_complexity(self) -> Dict[str, Any]: + + def _analyze_complexity(self) -> dict[str, Any]: """Analyze code complexity.""" # This is a placeholder return {"complex_functions": [], "average_complexity": 0} - - def _analyze_maintainability(self) -> Dict[str, Any]: + + def _analyze_maintainability(self) -> dict[str, Any]: """Analyze code maintainability.""" # This is a placeholder return {"maintainability_index": {}} - - def _analyze_style_issues(self) -> Dict[str, Any]: + + def _analyze_style_issues(self) -> dict[str, Any]: """Analyze code style issues.""" # This is a placeholder return {"style_violations": []} + class DependencyPlugin(AnalyzerPlugin): """Plugin for dependency analysis.""" - def analyze(self) -> Dict[str, Any]: + def analyze(self) -> dict[str, Any]: """Perform dependency analysis using the DependencyAnalyzer.""" - from codegen_on_oss.analyzers.dependencies import DependencyAnalyzer from codegen_on_oss.analyzers.codebase_context import CodebaseContext + from codegen_on_oss.analyzers.dependencies import DependencyAnalyzer # Create context if needed - context = getattr(self.manager, 'base_context', None) - if not context and hasattr(self.manager, 'base_codebase'): + context = getattr(self.manager, "base_context", None) + if not context and hasattr(self.manager, "base_codebase"): try: context = CodebaseContext( codebase=self.manager.base_codebase, base_path=self.manager.repo_path, pr_branch=None, - base_branch=self.manager.base_branch + base_branch=self.manager.base_branch, ) # Save context for future use self.manager.base_context = context except Exception as e: - logger.error(f"Error initializing context: {e}") + logger.exception(f"Error initializing context: {e}") # Initialize and run the dependency analyzer if context: dependency_analyzer = DependencyAnalyzer( - codebase=self.manager.base_codebase, - context=context + codebase=self.manager.base_codebase, context=context ) # Run analysis @@ -173,43 +182,44 @@ def analyze(self) -> Dict[str, Any]: result = { "import_dependencies": self._analyze_imports(), "circular_dependencies": self._find_circular_dependencies(), - "module_coupling": self._analyze_module_coupling() + "module_coupling": self._analyze_module_coupling(), } return result - def _analyze_imports(self) -> Dict[str, Any]: + def _analyze_imports(self) -> dict[str, Any]: """Fallback import analysis if context initialization failed.""" return {"module_dependencies": [], "external_dependencies": []} - def _find_circular_dependencies(self) -> Dict[str, Any]: + def _find_circular_dependencies(self) -> dict[str, Any]: """Fallback circular dependencies analysis if context initialization failed.""" return {"circular_imports": []} - def _analyze_module_coupling(self) -> Dict[str, Any]: + def _analyze_module_coupling(self) -> dict[str, Any]: """Fallback module coupling analysis if context initialization failed.""" return {"high_coupling_modules": []} + class AnalyzerManager: """ Unified manager for codebase analysis. - + This class serves as the main entry point for all analysis operations, coordinating different analyzer plugins and managing results. """ - + def __init__( self, - repo_url: Optional[str] = None, - repo_path: Optional[str] = None, + repo_url: str | None = None, + repo_path: str | None = None, base_branch: str = "main", - pr_number: Optional[int] = None, - language: Optional[str] = None, - file_ignore_list: Optional[List[str]] = None, - config: Optional[Dict[str, Any]] = None + pr_number: int | None = None, + language: str | None = None, + file_ignore_list: list[str] | None = None, + config: dict[str, Any] | None = None, ): """ Initialize the analyzer manager. - + Args: repo_url: URL of the repository to analyze repo_path: Local path to the repository to analyze @@ -224,88 +234,89 @@ def __init__( self.base_branch = base_branch self.pr_number = pr_number self.language = language - + # Use custom ignore list or default global list self.file_ignore_list = file_ignore_list or GLOBAL_FILE_IGNORE_LIST - + # Configuration options self.config = config or {} - + # Codebase and context objects self.base_codebase = None self.pr_codebase = None - + # Analysis results self.issues = [] self.results = {} - + # PR comparison data self.pr_diff = None self.commit_shas = None self.modified_symbols = None self.pr_branch = None - + # Initialize codebase(s) based on provided parameters if repo_url: self._init_from_url(repo_url, language) elif repo_path: self._init_from_path(repo_path, language) - + # If PR number is provided, initialize PR-specific data if self.pr_number is not None and self.base_codebase is not None: self._init_pr_data(self.pr_number) - + # Register default analyzers self._register_default_analyzers() - - def _init_from_url(self, repo_url: str, language: Optional[str] = None): + + def _init_from_url(self, repo_url: str, language: str | None = None): """Initialize codebase from a repository URL.""" try: # Extract repository information - if repo_url.endswith('.git'): + if repo_url.endswith(".git"): repo_url = repo_url[:-4] - - parts = repo_url.rstrip('/').split('/') + + parts = repo_url.rstrip("/").split("/") repo_name = parts[-1] owner = parts[-2] repo_full_name = f"{owner}/{repo_name}" - + # Create temporary directory for cloning import tempfile + tmp_dir = tempfile.mkdtemp(prefix="analyzer_") - + # Set up configuration config = CodebaseConfig( debug=False, allow_external=True, py_resolve_syspath=True, ) - + secrets = SecretsConfig() - + # Determine programming language prog_lang = None if language: prog_lang = ProgrammingLanguage(language.upper()) - + # Initialize the codebase logger.info(f"Initializing codebase from {repo_url}") - + self.base_codebase = Codebase.from_github( repo_full_name=repo_full_name, tmp_dir=tmp_dir, language=prog_lang, config=config, - secrets=secrets + secrets=secrets, ) - + logger.info(f"Successfully initialized codebase from {repo_url}") - + except Exception as e: - logger.error(f"Error initializing codebase from URL: {e}") + logger.exception(f"Error initializing codebase from URL: {e}") raise - - def _init_from_path(self, repo_path: str, language: Optional[str] = None): + + def _init_from_path(self, repo_path: str, language: str | None = None): """Initialize codebase from a local repository path.""" try: # Set up configuration @@ -314,163 +325,165 @@ def _init_from_path(self, repo_path: str, language: Optional[str] = None): allow_external=True, py_resolve_syspath=True, ) - + secrets = SecretsConfig() - + # Initialize the codebase logger.info(f"Initializing codebase from {repo_path}") - + # Determine programming language prog_lang = None if language: prog_lang = ProgrammingLanguage(language.upper()) - + # Set up repository configuration repo_config = RepoConfig.from_repo_path(repo_path) repo_config.respect_gitignore = False repo_operator = RepoOperator(repo_config=repo_config, bot_commit=False) - + # Create project configuration project_config = ProjectConfig( repo_operator=repo_operator, - programming_language=prog_lang if prog_lang else None + programming_language=prog_lang if prog_lang else None, ) - + # Initialize codebase self.base_codebase = Codebase( - projects=[project_config], - config=config, - secrets=secrets + projects=[project_config], config=config, secrets=secrets ) - + logger.info(f"Successfully initialized codebase from {repo_path}") - + except Exception as e: - logger.error(f"Error initializing codebase from path: {e}") + logger.exception(f"Error initializing codebase from path: {e}") raise - + def _init_pr_data(self, pr_number: int): """Initialize PR-specific data.""" try: logger.info(f"Fetching PR #{pr_number} data") result = self.base_codebase.get_modified_symbols_in_pr(pr_number) - + # Unpack the result tuple if len(result) >= 3: self.pr_diff, self.commit_shas, self.modified_symbols = result[:3] if len(result) >= 4: self.pr_branch = result[3] - + logger.info(f"Found {len(self.modified_symbols)} modified symbols in PR") - + # Initialize PR codebase self._init_pr_codebase() - + except Exception as e: - logger.error(f"Error initializing PR data: {e}") + logger.exception(f"Error initializing PR data: {e}") raise - + def _init_pr_codebase(self): """Initialize PR codebase by checking out the PR branch.""" if not self.base_codebase or not self.pr_number: logger.error("Base codebase or PR number not initialized") return - + try: # Get PR data if not already fetched if not self.pr_branch: self._init_pr_data(self.pr_number) - + if not self.pr_branch: logger.error("Failed to get PR branch") return - + # Clone the base codebase self.pr_codebase = self.base_codebase - + # Checkout PR branch logger.info(f"Checking out PR branch: {self.pr_branch}") self.pr_codebase.checkout(self.pr_branch) - + logger.info("Successfully initialized PR codebase") - + except Exception as e: - logger.error(f"Error initializing PR codebase: {e}") + logger.exception(f"Error initializing PR codebase: {e}") raise - + def _register_default_analyzers(self): """Register default analyzers.""" registry = AnalyzerRegistry() registry.register(AnalysisType.CODE_QUALITY, CodeQualityPlugin) registry.register(AnalysisType.DEPENDENCY, DependencyPlugin) - + def add_issue(self, issue: Issue): """Add an issue to the list.""" # Check if issue should be skipped if self._should_skip_issue(issue): return - + self.issues.append(issue) - + def _should_skip_issue(self, issue: Issue) -> bool: """Check if an issue should be skipped.""" # Skip issues in ignored files file_path = issue.file - + # Check against ignore list for pattern in self.file_ignore_list: if pattern in file_path: return True - + # Check if the file is a test file if "test" in file_path.lower() or "tests" in file_path.lower(): # Skip low-severity issues in test files if issue.severity in [IssueSeverity.INFO, IssueSeverity.WARNING]: return True - + return False - - def get_issues(self, severity: Optional[IssueSeverity] = None, category: Optional[IssueCategory] = None) -> List[Issue]: + + def get_issues( + self, + severity: IssueSeverity | None = None, + category: IssueCategory | None = None, + ) -> list[Issue]: """ Get all issues matching the specified criteria. - + Args: severity: Optional severity level to filter by category: Optional category to filter by - + Returns: List of matching issues """ filtered_issues = self.issues - + if severity: filtered_issues = [i for i in filtered_issues if i.severity == severity] - + if category: filtered_issues = [i for i in filtered_issues if i.category == category] - + return filtered_issues - + def analyze( - self, - analysis_types: Optional[List[Union[AnalysisType, str]]] = None, - output_file: Optional[str] = None, - output_format: str = "json" - ) -> Dict[str, Any]: + self, + analysis_types: list[AnalysisType | str] | None = None, + output_file: str | None = None, + output_format: str = "json", + ) -> dict[str, Any]: """ Perform analysis on the codebase. - + Args: analysis_types: List of analysis types to perform output_file: Path to save results to output_format: Format of the output file - + Returns: Dictionary containing analysis results """ if not self.base_codebase: raise ValueError("Codebase not initialized") - + # Convert string analysis types to enums if analysis_types: analysis_types = [ @@ -480,78 +493,92 @@ def analyze( else: # Default to code quality and dependency analysis analysis_types = [AnalysisType.CODE_QUALITY, AnalysisType.DEPENDENCY] - + # Initialize results self.results = { "metadata": { "analysis_time": datetime.now().isoformat(), "analysis_types": [t.value for t in analysis_types], - "repo_name": getattr(self.base_codebase.ctx, 'repo_name', None), - "language": str(getattr(self.base_codebase.ctx, 'programming_language', None)), + "repo_name": getattr(self.base_codebase.ctx, "repo_name", None), + "language": str( + getattr(self.base_codebase.ctx, "programming_language", None) + ), }, "summary": {}, - "results": {} + "results": {}, } - + # Reset issues self.issues = [] - + # Run each analyzer registry = AnalyzerRegistry() - + for analysis_type in analysis_types: analyzer_class = registry.get_analyzer(analysis_type) - + if analyzer_class: logger.info(f"Running {analysis_type.value} analysis") analyzer = analyzer_class(self) analysis_result = analyzer.analyze() - + # Add results to unified results self.results["results"][analysis_type.value] = analysis_result else: logger.warning(f"No analyzer found for {analysis_type.value}") - + # Add issues to results self.results["issues"] = [issue.to_dict() for issue in self.issues] - + # Add issue statistics self.results["issue_stats"] = { "total": len(self.issues), "by_severity": { - "critical": sum(1 for issue in self.issues if issue.severity == IssueSeverity.CRITICAL), - "error": sum(1 for issue in self.issues if issue.severity == IssueSeverity.ERROR), - "warning": sum(1 for issue in self.issues if issue.severity == IssueSeverity.WARNING), - "info": sum(1 for issue in self.issues if issue.severity == IssueSeverity.INFO), - } + "critical": sum( + 1 + for issue in self.issues + if issue.severity == IssueSeverity.CRITICAL + ), + "error": sum( + 1 for issue in self.issues if issue.severity == IssueSeverity.ERROR + ), + "warning": sum( + 1 + for issue in self.issues + if issue.severity == IssueSeverity.WARNING + ), + "info": sum( + 1 for issue in self.issues if issue.severity == IssueSeverity.INFO + ), + }, } - + # Save results if output file is specified if output_file: self.save_results(output_file, output_format) - + return self.results - + def save_results(self, output_file: str, format: str = "json"): """ Save analysis results to a file. - + Args: output_file: Path to the output file format: Output format (json, html) """ if format == "json": - with open(output_file, 'w') as f: + with open(output_file, "w") as f: json.dump(self.results, f, indent=2) elif format == "html": self._generate_html_report(output_file) else: # Default to JSON - with open(output_file, 'w') as f: + with open(output_file, "w") as f: json.dump(self.results, f, indent=2) - + logger.info(f"Results saved to {output_file}") - + def _generate_html_report(self, output_file: str): """Generate an HTML report of the analysis results.""" html_content = f""" @@ -578,85 +605,93 @@ def _generate_html_report(self, output_file: str):
Repository: {self.results['metadata'].get('repo_name', 'Unknown')}
-Language: {self.results['metadata'].get('language', 'Unknown')}
-Analysis Time: {self.results['metadata'].get('analysis_time', 'Unknown')}
-Analysis Types: {', '.join(self.results['metadata'].get('analysis_types', []))}
+Repository: {self.results["metadata"].get("repo_name", "Unknown")}
+Language: {self.results["metadata"].get("language", "Unknown")}
+Analysis Time: {self.results["metadata"].get("analysis_time", "Unknown")}
+Analysis Types: {", ".join(self.results["metadata"].get("analysis_types", []))}
Total Issues: {len(self.issues)}
{location} {category} {issue.message}
-{issue.suggestion if hasattr(issue, 'suggestion') else ""}
+{issue.suggestion if hasattr(issue, "suggestion") else ""}
{json.dumps(results, indent=2)}
"""
-
+
html_content += """