def find_git_root(start_dir):
    """Return the nearest directory at or above ``start_dir`` containing ``.git``.

    Args:
        start_dir: Directory to start searching from.

    Returns:
        The absolute path of the first directory (walking upwards, starting
        with ``start_dir`` itself) that has a ``.git`` entry, or ``None`` when
        the filesystem root is reached without finding one.
    """
    current = os.path.abspath(start_dir)
    while True:
        # ``exists`` (not ``isdir``) so that worktrees/submodules, where
        # ``.git`` is a plain file, are recognised as well.
        if os.path.exists(os.path.join(current, ".git")):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # dirname() of the root is the root itself
            return None
        current = parent


def load_codebase():
    """Build the Codebase to analyse.

    A local checkout (current directory, then its parents) is preferred; when
    none is found the ``codegen-sh/codegen`` repository is cloned instead.
    """
    cwd = os.getcwd()
    repo_root = find_git_root(cwd)

    if repo_root == cwd:
        print("Using current directory as repository...")
    else:
        print("Searching for git repository in parent directories...")
        if repo_root is None:
            print("No local git repository found. Cloning a repository...")
            return Codebase.from_repo(repo_full_name="codegen-sh/codegen", language="python")
        print(f"Found git repository at {repo_root}")

    # Single construction path shared by "current directory" and "parent
    # directory" discovery.
    repo_config = RepoConfig.from_repo_path(repo_root)
    repo_operator = RepoOperator(repo_config=repo_config)
    project = ProjectConfig.from_repo_operator(
        repo_operator=repo_operator,
        programming_language=ProgrammingLanguage.PYTHON,
    )
    return Codebase(projects=[project])


if __name__ == "__main__":
    try:
        print("Initializing codebase...")
        codebase = load_codebase()
        print(f"Codebase loaded with {len(codebase.files)} files and {len(codebase.symbols)} symbols")

        # Run the analysis
        run(codebase)

    except Exception as e:  # top-level boundary of an example script
        print(f"\n❌ Error: {e}")
        print("\nTraceback:")
        traceback.print_exc()
        sys.exit(1)
def print_symbol_attribution(codebase):
    """Print attribution details for the ten most-used symbols in ``codebase``.

    Attribution data is first attached to the symbols via
    ``add_attribution_to_symbols``; symbols are then ranked by their number of
    usages and, for each of the top ten, the last editor, editor history and
    AI-authorship flag are printed (or "Not available" when the attribute is
    missing).

    Args:
        codebase: The codebase whose ``symbols`` are inspected.
    """
    print("\n🔍 Symbol Attribution Examples:")

    # First, make sure attribution information is added to symbols
    ai_authors = ["devin[bot]", "codegen[bot]", "github-actions[bot]"]
    add_attribution_to_symbols(codebase, ai_authors)

    # Collect (symbol, usage count) pairs; read ``usages`` only once per symbol.
    usage_counts = []
    for symbol in codebase.symbols:
        usages = getattr(symbol, "usages", None)
        if usages is not None and len(usages) > 0:
            usage_counts.append((symbol, len(usages)))

    # Most used first; sorted() is stable, so ties keep codebase order.
    top_symbols = sorted(usage_counts, key=lambda pair: pair[1], reverse=True)[:10]

    for rank, (symbol, usage_count) in enumerate(top_symbols, start=1):
        print(f"\n📊 Symbol #{rank}: {symbol.name} ({type(symbol).__name__})")
        print(f" • File: {symbol.filepath}")
        print(f" • Usages: {usage_count}")

        if hasattr(symbol, "last_editor"):
            print(f" • Last editor: {symbol.last_editor}")
        else:
            print(" • Last editor: Not available")

        history = getattr(symbol, "editor_history", None)
        if history:
            hidden = len(history) - 5
            suffix = f" and {hidden} more..." if hidden > 0 else ""
            print(f" • Editor history: {', '.join(history[:5])}{suffix}")
        else:
            print(" • Editor history: Not available")

        if hasattr(symbol, "is_ai_authored"):
            print(f" • AI authored: {'Yes' if symbol.is_ai_authored else 'No'}")
        else:
            print(" • AI authored: Not available")


if __name__ == "__main__":
    try:
        print("Initializing codebase...")

        # Use current directory if it's a git repository
        if os.path.exists(".git"):
            print("Using current directory as repository...")
            repo_config = RepoConfig.from_repo_path(os.getcwd())
            repo_operator = RepoOperator(repo_config=repo_config)
            project = ProjectConfig.from_repo_operator(
                repo_operator=repo_operator,
                programming_language=ProgrammingLanguage.PYTHON,
            )
            codebase = Codebase(projects=[project])
        else:
            # Use from_repo method for a well-known repository
            print("Using a sample repository...")
            codebase = Codebase.from_repo(
                repo_full_name="codegen-sh/codegen",
                language="python",
            )

        print(f"Codebase loaded with {len(codebase.files)} files and {len(codebase.symbols)} symbols")

        # First run the analysis to gather attribution data
        print("\n🔍 Running AI impact analysis...")
        run(codebase)

        # Then show examples of accessing attribution information
        print_symbol_attribution(codebase)

    except Exception as e:  # top-level boundary of an example script
        print(f"\n❌ Error: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
class Cursor:
    """Read-only helper for inspecting a local Cursor editor installation.

    The class locates Cursor's SQLite state databases (``state.vscdb``) for the
    current platform and reads the stored auth token, the list of workspaces
    and their chat/composer history from them.

    The ``async`` methods perform blocking file, SQLite and HTTP calls; they
    are coroutines only so callers can ``await`` them uniformly.
    """

    # Seconds to wait for the Cursor API. ``requests`` applies no timeout by
    # default, so a stalled connection would otherwise hang forever.
    REQUEST_TIMEOUT = 10

    _TOKEN_KEYS = ("cursorAuth/accessToken",)
    # Key names under which chat / composer data has been found; tried in order.
    _CHAT_KEYS = (
        "workbench.panel.aichat.view.aichat.chatdata",
        "workbench.panel.chat.view.chat.chatdata",
        "workbench.view.chat.chatdata",
    )
    _COMPOSER_KEYS = ("composer.composerData", "cursor.composerData")

    def __init__(self, log_level=logging.INFO):
        """Initialize the Cursor class with optional log level configuration."""
        logging.basicConfig(level=log_level)
        self.logger = logging.getLogger("Cursor")
        self.api_base_url = "https://cursor.com"

    def log(self, message, is_error=False):
        """Log ``message`` at ERROR level when ``is_error`` is true, else DEBUG."""
        if is_error:
            self.logger.error(message)
        else:
            self.logger.debug(message)

    def get_windows_username(self):
        """Get the Windows username when running in a WSL environment.

        Returns:
            The output of ``cmd.exe /c echo %USERNAME%`` with surrounding
            whitespace stripped, or ``None`` if the command could not be run.
        """
        try:
            import subprocess

            result = subprocess.run(["cmd.exe", "/c", "echo", "%USERNAME%"], capture_output=True, text=True, check=False)
            return result.stdout.strip()
        except Exception as e:
            self.log(f"Error getting Windows username: {e}", True)
            return None

    def is_installed(self):
        """Check if Cursor is installed on the system.

        The presence of the global state database decides the result. The
        binary lookup is advisory only: Cursor may live in a location this
        class does not probe, so a missing binary is logged but does not make
        the check fail.

        Returns:
            bool: True if the state database exists, False otherwise
        """
        if not os.path.exists(self.get_cursor_db_path()):
            self.log("Cursor database not found", True)
            return False

        if self._get_binary_path() is None:
            self.log("Cursor binary not found in the usual locations")

        return True

    @staticmethod
    def _folder_name():
        """Return the application folder name ("Cursor" or "Cursor Nightly")."""
        app_name = os.environ.get("VSCODE_APP_NAME", "")
        return "Cursor Nightly" if app_name == "Cursor Nightly" else "Cursor"

    def _get_binary_path(self):
        """Get the path to the Cursor binary based on platform.

        Returns:
            A ``Path`` to the first well-known install location that exists,
            else whatever ``shutil.which`` finds on ``PATH``, else ``None``.
        """
        try:
            folder_name = self._folder_name()
            system = platform.system()

            if system == "Windows":
                program_files = os.environ.get("ProgramFiles", "C:\\Program Files")
                candidates = [Path(program_files) / folder_name / "Cursor.exe"]
                executable = "cursor.exe"
            elif system == "Darwin":  # macOS
                candidates = [Path(f"/Applications/{folder_name}.app/Contents/MacOS/Cursor")]
                executable = "cursor"
            else:  # Linux and others
                executable = folder_name.lower()
                candidates = [
                    Path(f"/usr/bin/{executable}"),
                    Path(f"/usr/local/bin/{executable}"),
                    Path(os.path.expanduser(f"~/.local/bin/{executable}")),
                ]

            for candidate in candidates:
                if candidate.exists():
                    return candidate

            on_path = shutil.which(executable)
            return Path(on_path) if on_path else None
        except Exception as error:
            self.log(f"Error finding Cursor binary: {error}", True)
            return None

    def _user_dir(self):
        """Return Cursor's per-user ``User`` data directory for this platform.

        Under WSL (``VSCODE_REMOTE_NAME == "wsl"``) the Windows profile mounted
        at ``/mnt/c`` is used when the Windows username can be determined;
        otherwise, and on unknown platforms, ``~/.config`` is used.
        """
        folder_name = self._folder_name()
        system = platform.system()

        if system == "Windows":
            return os.path.join(os.environ.get("APPDATA", ""), folder_name, "User")
        if system == "Darwin":  # macOS
            return os.path.join(os.path.expanduser("~"), "Library", "Application Support", folder_name, "User")
        if system == "Linux" and os.environ.get("VSCODE_REMOTE_NAME") == "wsl":
            windows_username = self.get_windows_username()
            if windows_username:
                return os.path.join("/mnt/c/Users", windows_username, "AppData/Roaming", folder_name, "User")

        return os.path.join(os.path.expanduser("~"), ".config", folder_name, "User")

    def get_cursor_db_path(self):
        """Determine the path to the global Cursor state database."""
        return os.path.join(self._user_dir(), "globalStorage", "state.vscdb")

    def get_cursor_storage_path(self):
        """Determine the path to the Cursor workspace storage directory."""
        return os.path.join(self._user_dir(), "workspaceStorage")

    def get_global_storage_path(self):
        """Determine the path to the Cursor global storage database.

        Same location as :meth:`get_cursor_db_path`; kept as a separate name
        for existing callers.
        """
        return self.get_cursor_db_path()

    @staticmethod
    def _first_value(conn, table, keys):
        """Return ``(key, value)`` for the first of ``keys`` present in ``table``.

        Returns ``(None, None)`` when none of the keys has a row.
        """
        for key in keys:
            # ``table`` is always one of the literal names chosen by the
            # callers in this class, never external input, so formatting it
            # into the statement is safe; the key itself is bound.
            row = conn.execute(f"SELECT value FROM {table} WHERE key = ?", (key,)).fetchone()
            if row:
                return key, row[0]
        return None, None

    async def read_auth_token(self):
        """Retrieve the Cursor auth token and turn it into a session token.

        The JWT stored under ``cursorAuth/accessToken`` is decoded *without*
        signature verification, only to read its ``sub`` claim. The part of
        ``sub`` after the first ``|`` is used as the user id.

        Returns:
            ``"<user_id>%3A%3A<jwt>"`` on success, ``None`` when the database,
            the token or a usable ``sub`` claim is missing.
        """
        from contextlib import closing

        db_path = self.get_cursor_db_path()
        self.log(f"Platform: {platform.system()}")
        self.log(f"Attempting to open database at: {db_path}")

        if not os.path.exists(db_path):
            self.log("Database file does not exist", True)
            return None

        try:
            # closing() releases the connection on every path, including errors.
            with closing(sqlite3.connect(db_path)) as conn:
                _, token = self._first_value(conn, "ItemTable", self._TOKEN_KEYS)
        except Exception as error:
            self.log(f"Error opening database: {error}", True)
            return None

        if not token:
            self.log("No token found in database")
            return None

        # Only sizes are logged below: the token and its payload are
        # credentials and must not end up in log files.
        self.log(f"Token length: {len(token)}")
        try:
            decoded = jwt.decode(token, options={"verify_signature": False})
            if not decoded or "sub" not in decoded:
                self.log("Invalid JWT structure: no 'sub' claim", True)
                return None

            parts = str(decoded["sub"]).split("|")
            if len(parts) < 2 or not parts[1]:
                self.log("Invalid JWT structure: 'sub' has no user id part", True)
                return None

            session_token = f"{parts[1]}%3A%3A{token}"
            self.log(f"Created session token, length: {len(session_token)}")
            return session_token
        except Exception as error:
            self.log(f"Error processing token: {error}", True)
            self.log(f"Error details: {error.__class__.__name__}, {error!s}", True)
            return None

    async def get_user_info(self):
        """Get user information using the auth token.

        Returns:
            dict: User information if successful, None otherwise
        """
        token = await self.read_auth_token()
        if not token:
            self.log("No auth token available", True)
            return None

        try:
            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
            response = requests.get(f"{self.api_base_url}/api/v1/user", headers=headers, timeout=self.REQUEST_TIMEOUT)

            if response.status_code == 200:
                user_data = response.json()
                self.log(f"Successfully retrieved user info: {user_data}")
                return user_data

            self.log(f"Failed to get user info: {response.status_code}", True)
            self.log(f"Response: {response.text}", True)
            return None
        except Exception as error:
            self.log(f"Error getting user info: {error}", True)
            self.log(f"Error details: {error.__class__.__name__}, {error!s}", True)
            return None

    async def validate_token(self):
        """Return True when user info can be fetched with the current token."""
        return await self.get_user_info() is not None

    async def get_workspaces(self):
        """Get all workspaces from the Cursor storage directory.

        Returns:
            A list of dicts with ``id``, ``path``, ``dbPath`` and ``name`` for
            every sub-directory that contains a ``state.vscdb``. ``name`` is
            the last component of ``folder`` in ``workspace.json`` when
            available, else the directory name. Empty list on any failure.
        """
        try:
            workspace_path = self.get_cursor_storage_path()
            self.log(f"Looking for workspaces in: {workspace_path}")

            workspace_dir = Path(workspace_path)
            if not workspace_dir.exists():
                self.log(f"Workspace directory does not exist: {workspace_path}", True)
                return []

            workspaces = []
            for entry in workspace_dir.iterdir():
                if not entry.is_dir():
                    continue

                db_path = entry / "state.vscdb"
                if not db_path.exists():
                    self.log(f"Skipping {entry.name}: no state.vscdb found")
                    continue

                info = {"id": entry.name, "path": str(entry), "dbPath": str(db_path)}

                manifest = entry / "workspace.json"
                if manifest.exists():
                    try:
                        manifest_data = json.loads(manifest.read_text(encoding="utf-8"))
                        if "folder" in manifest_data:
                            info["name"] = Path(manifest_data["folder"]).name
                    except Exception as e:
                        self.log(f"Error reading workspace.json: {e}", True)

                info.setdefault("name", entry.name)
                workspaces.append(info)

            return workspaces
        except Exception as e:
            self.log(f"Failed to get workspaces: {e}", True)
            return []

    def _parse_json(self, raw, label):
        """Decode ``raw`` as JSON; log and return ``None`` when it is not valid."""
        try:
            return json.loads(raw)
        except (ValueError, TypeError) as e:
            self.log(f"Error parsing {label} data: {e}", True)
            return None

    async def get_workspace_chat_data(self, workspace_id: str):
        """Get chat data for a specific workspace.

        Args:
            workspace_id: Name of the workspace directory below the workspace
                storage path.

        Returns:
            A dict that may contain ``"chats"`` and/or ``"composers"`` (the
            decoded JSON values), or ``None`` when the database is missing or
            reading it fails.
        """
        from contextlib import closing

        try:
            db_path = os.path.join(self.get_cursor_storage_path(), workspace_id, "state.vscdb")
            if not os.path.exists(db_path):
                self.log(f"Database does not exist: {db_path}", True)
                return None

            # closing() releases the connection even when a query raises.
            with closing(sqlite3.connect(db_path)) as conn:
                tables = [row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table';")]
                self.log(f"Tables in database: {tables}")

                # Determine the correct table name (ItemTable or Item)
                item_table = "ItemTable" if "ItemTable" in tables else "Item"

                try:
                    chat_key, chat_raw = self._first_value(conn, item_table, self._CHAT_KEYS)
                    composer_key, composer_raw = self._first_value(conn, item_table, self._COMPOSER_KEYS)
                except Exception as e:
                    self.log(f"Error querying database: {e}", True)
                    chat_key = chat_raw = composer_key = composer_raw = None

            response = {}

            if chat_key is None:
                self.log("No chat data found in database")
            else:
                self.log(f"Found chat data with key: {chat_key}")
                chat_data = self._parse_json(chat_raw, "chat")
                if chat_data is not None:
                    response["chats"] = chat_data
                    if isinstance(chat_data, dict):
                        self.log(f"Successfully parsed chat data with {len(chat_data.get('tabs', []))} tabs")

            if composer_key is None:
                self.log("No composer data found in database")
            else:
                self.log(f"Found composer data with key: {composer_key}")
                composers = self._parse_json(composer_raw, "composer")
                if composers is not None:
                    response["composers"] = composers
                    if isinstance(composers, dict):
                        self.log(f"Found {len(composers.get('allComposers', []))} composers")

            return response
        except Exception as e:
            self.log(f"Failed to get workspace data: {e}", True)
            import traceback

            self.log(traceback.format_exc(), True)
            return None

    @staticmethod
    def _first_match(candidates, needle):
        """Return the first string in ``candidates`` containing ``needle``.

        ``needle`` must already be lower-cased. Non-string entries (for
        example a ``None`` title) are skipped instead of raising.
        """
        for text in candidates:
            if isinstance(text, str) and needle in text.lower():
                return text
        return None

    @staticmethod
    def _timestamp_key(value):
        """Build a sort key that tolerates mixed or missing timestamps.

        Numbers sort among themselves and above everything else, strings sort
        among themselves, anything else sorts last (under ``reverse=True``).
        Comparing raw values would raise ``TypeError`` as soon as one result
        has a numeric timestamp and another the ``""`` default.
        """
        if isinstance(value, (int, float)):
            return (1, float(value), "")
        if isinstance(value, str):
            return (0, 0.0, value)
        return (0, 0.0, "")

    def _search_chats(self, chat_data, needle, origin):
        """Return result dicts for every chat tab whose title or bubbles match."""
        hits = []
        for tab in (chat_data or {}).get("tabs") or []:
            # Title first, then messages: the first hit becomes matchingText.
            texts = [tab.get("chatTitle")]
            texts.extend(bubble.get("text") for bubble in tab.get("bubbles") or [])
            match = self._first_match(texts, needle)
            if match is None:
                continue

            tab_id = tab.get("tabId", "")
            hits.append(
                {
                    **origin,
                    "chatId": tab_id,
                    "chatTitle": tab.get("chatTitle") or f"Chat {str(tab_id)[:8]}",
                    "timestamp": tab.get("lastSendTime", ""),
                    "matchingText": match,
                    "type": "chat",
                }
            )
        return hits

    def _search_composers(self, composer_data, needle, origin):
        """Return result dicts for every composer whose text or conversation matches."""
        hits = []
        for composer in (composer_data or {}).get("allComposers") or []:
            texts = [composer.get("text")]
            texts.extend(message.get("text") for message in composer.get("conversation") or [])
            match = self._first_match(texts, needle)
            if match is None:
                continue

            composer_id = composer.get("composerId", "")
            hits.append(
                {
                    **origin,
                    "chatId": composer_id,
                    "chatTitle": composer.get("text") or f"Composer {str(composer_id)[:8]}",
                    "timestamp": composer.get("lastUpdatedAt", composer.get("createdAt", "")),
                    "matchingText": match,
                    "type": "composer",
                }
            )
        return hits

    async def search_chat_history(self, query: str, search_type: str = "all"):
        """Search across all workspaces for chat history matching the query.

        Matching is a case-insensitive substring test.

        Args:
            query: The search term to look for
            search_type: Type of logs to search - 'all', 'chat', or 'composer'

        Returns:
            list: Result dicts (``workspaceId``, ``workspaceName``, ``chatId``,
            ``chatTitle``, ``timestamp``, ``matchingText``, ``type``), newest
            first. Empty when ``query`` is empty or nothing matches.
        """
        try:
            if not query:
                self.log("No search query provided", True)
                return []

            needle = query.lower()
            results = []

            for workspace in await self.get_workspaces():
                workspace_id = workspace["id"]
                try:
                    workspace_data = await self.get_workspace_chat_data(workspace_id)
                    if not workspace_data:
                        continue

                    origin = {"workspaceId": workspace_id, "workspaceName": workspace.get("name", workspace_id)}
                    if search_type in ("all", "chat"):
                        results.extend(self._search_chats(workspace_data.get("chats"), needle, origin))
                    if search_type in ("all", "composer"):
                        results.extend(self._search_composers(workspace_data.get("composers"), needle, origin))
                except Exception as e:
                    # One unreadable workspace must not hide the others.
                    self.log(f"Error searching workspace {workspace_id}: {e}", True)

            # Sort results by timestamp, newest first
            results.sort(key=lambda result: self._timestamp_key(result.get("timestamp")), reverse=True)
            return results
        except Exception as e:
            self.log(f"Failed to search chat history: {e}", True)
            return []
def _print_sample_messages(messages, ai_marker):
    """Print up to three messages of a conversation, truncated to 100 chars.

    Args:
        messages: List of message dicts with optional ``type`` and ``text``.
        ai_marker: Value of ``type`` that identifies an AI-authored message.
    """
    if not messages:
        return

    print(" Sample messages:")
    for message in messages[:3]:  # Show first 3 messages
        msg_type = "AI" if message.get("type") == ai_marker else "User"
        text = message.get("text", "")
        # Truncate long messages
        if len(text) > 100:
            text = text[:97] + "..."
        print(f" {msg_type}: {text}")
    if len(messages) > 3:
        print(f" ... and {len(messages) - 3} more messages")


async def main():
    """Manual smoke test: summarise the local Cursor workspaces and chat history."""
    cursor = Cursor(log_level=logging.DEBUG)

    # Check if Cursor is installed
    if not cursor.is_installed():
        print("Cursor is not installed or not properly configured")
        return

    token = await cursor.read_auth_token()
    # The session token is a credential; report only whether one was found.
    print(f"Token: {'found' if token else 'not found'}")

    # Get all workspaces
    workspaces = await cursor.get_workspaces()
    print(f"Found {len(workspaces)} workspaces:")
    for workspace in workspaces:
        print(f" - {workspace['name']} ({workspace['id']})")

    if not workspaces:
        print("No workspaces found")

    # Summarise the chat data of every workspace
    for workspace in workspaces:
        print(f"\nGetting chat data for workspace: {workspace['name']}")
        chat_data = await cursor.get_workspace_chat_data(workspace["id"])

        if not chat_data:
            print("No chat data found for this workspace")
            continue

        if "chats" in chat_data:
            tabs = chat_data["chats"].get("tabs", [])
            print(f"\nFound {len(tabs)} chat tabs")
            for number, tab in enumerate(tabs, start=1):
                bubbles = tab.get("bubbles", [])
                print(f" - Tab {number}: {tab.get('chatTitle', 'Untitled')} ({len(bubbles)} messages)")
                # Chat bubbles mark AI messages with type "ai".
                _print_sample_messages(bubbles, "ai")

        if "composers" in chat_data:
            composers = chat_data["composers"].get("allComposers", [])
            print(f"\nFound {len(composers)} composers")
            for number, composer in enumerate(composers, start=1):
                conversation = composer.get("conversation", [])
                print(f" - Composer {number}: {composer.get('text', 'Untitled')} ({len(conversation)} messages)")
                # Composer messages mark AI messages with type 2.
                _print_sample_messages(conversation, 2)

    # Search for chat history
    search_query = "def"  # Example search term
    print(f"\nSearching chat history for '{search_query}'...")
    search_results = await cursor.search_chat_history(search_query)

    if not search_results:
        print("No search results found")
        return

    print(f"Found {len(search_results)} results:")
    for number, result in enumerate(search_results[:10], start=1):  # Show first 10 results
        print(f" {number}. [{result['type']}] {result['chatTitle']} ({result['workspaceName']})")
        # Show a snippet of the matching text
        matching_text = result["matchingText"]
        if len(matching_text) > 100:
            matching_text = matching_text[:97] + "..."
        print(f" Match: {matching_text}")


if __name__ == "__main__":
    # For testing - lets pull prompts and responses from local cursor db
    asyncio.run(main())
class Windsurf:
    """Helper for inspecting a local Windsurf/Codeium installation.

    Reads the API key from the ``auth.json`` file in the Codeium user data
    directory and can validate it against the Codeium API.
    """

    # Seconds to wait for the Codeium API. ``requests`` applies no timeout by
    # default, so a stalled connection would otherwise hang forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, log_level=logging.INFO):
        """Initialize the Windsurf class for Codeium integration."""
        logging.basicConfig(level=log_level)
        self.logger = logging.getLogger("Windsurf")
        self.api_base_url = "https://api.codeium.com"
        self.user_data_path = self._get_user_data_path()

    def log(self, message, is_error=False):
        """Log ``message`` at ERROR level when ``is_error`` is true, else DEBUG."""
        if is_error:
            self.logger.error(message)
        else:
            self.logger.debug(message)

    def _get_user_data_path(self):
        """Get the path to Windsurf/Codeium user data based on platform."""
        system = platform.system()
        if system == "Windows":
            return Path(os.environ.get("APPDATA", "")) / "Codeium"
        if system == "Darwin":  # macOS
            return Path.home() / "Library" / "Application Support" / "Codeium"
        return Path.home() / ".config" / "Codeium"  # Linux and others

    def is_installed(self):
        """Check if Windsurf/Codeium is installed on the system.

        The user data directory, the auth file and the config file must all
        exist. The binary lookup is advisory only: a missing binary is logged
        but does not make the check fail.

        Returns:
            bool: True if installed, False otherwise
        """
        required = (
            (self.user_data_path, "Codeium user data directory not found"),
            (self.get_auth_token_path(), "Codeium auth file not found"),
            (self.get_config_path(), "Codeium config file not found"),
        )
        for path, problem in required:
            if not path.exists():
                self.log(problem, True)
                return False

        if self._get_binary_path() is None:
            self.log("Codeium binary not found in the usual locations")

        return True

    def _get_binary_path(self):
        """Get the path to the Codeium binary based on platform.

        Returns:
            A ``Path`` to the first well-known install location that exists,
            else whatever ``shutil.which`` finds on ``PATH``, else ``None``.
        """
        try:
            system = platform.system()
            if system == "Windows":
                program_files = os.environ.get("ProgramFiles", "C:\\Program Files")
                candidates = [Path(program_files) / "Codeium" / "Codeium.exe"]
                executable = "codeium.exe"
            elif system == "Darwin":  # macOS
                candidates = [Path("/Applications/Codeium.app/Contents/MacOS/Codeium")]
                executable = "codeium"
            else:  # Linux and others
                candidates = [
                    Path("/usr/bin/codeium"),
                    Path("/usr/local/bin/codeium"),
                    Path(os.path.expanduser("~/.local/bin/codeium")),
                ]
                executable = "codeium"

            for candidate in candidates:
                if candidate.exists():
                    return candidate

            on_path = shutil.which(executable)
            return Path(on_path) if on_path else None
        except Exception as e:
            self.log(f"Error finding Codeium binary: {e!s}", True)
            return None

    def get_config_path(self):
        """Get the path to the Codeium configuration file."""
        return self.user_data_path / "config.json"

    def get_auth_token_path(self):
        """Get the path to the authentication token file."""
        return self.user_data_path / "auth.json"

    def get_auth_token(self):
        """Read the authentication token from the auth file.

        Returns:
            The value stored under ``"api_key"`` in ``auth.json``, or ``None``
            when the file is missing, unreadable, not valid JSON, or has no
            such key.
        """
        try:
            auth_path = self.get_auth_token_path()
            self.log(f"Reading auth token from: {auth_path}")

            if not auth_path.exists():
                self.log("Auth token file does not exist", True)
                return None

            with open(auth_path, encoding="utf-8") as f:
                auth_data = json.load(f)

            if "api_key" in auth_data:
                return auth_data["api_key"]

            self.log("No API key found in auth data", True)
            return None
        except Exception as e:
            self.log(f"Error reading auth token: {e!s}", True)
            return None

    async def get_user_info(self):
        """Get user information using the auth token.

        Returns:
            The decoded JSON body of ``GET <api_base_url>/user/info`` on HTTP
            200, ``None`` when there is no token or the request fails.
        """
        token = self.get_auth_token()
        if not token:
            return None

        try:
            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
            response = requests.get(f"{self.api_base_url}/user/info", headers=headers, timeout=self.REQUEST_TIMEOUT)

            if response.status_code == 200:
                user_info = response.json()  # decode the body only once
                self.log(f"User info response: {user_info}")
                return user_info

            self.log(f"Failed to get user info: {response.status_code}", True)
            return None
        except Exception as e:
            self.log(f"Error getting user info: {e!s}", True)
            return None

    async def validate_token(self):
        """Return True when user info can be fetched with the current token."""
        return await self.get_user_info() is not None


async def main():
    """Manual smoke test: report whether a Codeium token exists and is valid."""
    windsurf = Windsurf(log_level=logging.DEBUG)

    # Check if Codeium is installed
    if not windsurf.is_installed():
        print("Codeium is not installed or not properly configured")
        return

    # Windsurf exposes get_auth_token(); there is no read_auth_token() here.
    token = windsurf.get_auth_token()
    # The API key is a credential; report only whether one was found.
    print(f"Token: {'found' if token else 'not found'}")

    is_valid = await windsurf.validate_token()
    print(f"Token is valid: {is_valid}")


if __name__ == "__main__":
    # TODO: don't have windsurf at the moment, test later if feature is needed
    asyncio.run(main())
def diagnose_repository(codebase):
    """Print diagnostic information about the repository.

    Reports the repository path, the HEAD commit (id, author, first lines of
    the message), whether a ``.git/shallow`` marker exists, and a sample count
    of up to 10 commits. Nothing is raised: every failure is printed as a
    warning instead.

    Args:
        codebase: Object whose first project exposes
            ``repo_operator.repo_path``.
    """
    try:
        repo_path = codebase.ctx.projects[0].repo_operator.repo_path
        print("\n🔍 Repository Diagnostics:")
        print(f"Repository path: {repo_path}")

        # Check if it's a git repository
        if not os.path.exists(os.path.join(repo_path, ".git")):
            print("⚠️ No .git directory found. This might not be a git repository.")
            return

        try:
            repo = pygit2.Repository(repo_path)

            # Check if repository has commits
            try:
                head = repo.head
                head_commit = repo.get(head.target)
                print(f"Repository has a HEAD commit: {head_commit.id}")
                print(f"HEAD commit author: {head_commit.author.name} <{head_commit.author.email}>")
                # Built outside the f-string: a backslash inside a replacement
                # field is a SyntaxError on Python versions before 3.12.
                message_preview = "\n".join(head_commit.message.strip().split("\n")[:5])
                print(f"HEAD commit message (first 5 lines only): {message_preview}")
                print("...")
                # Check if it's a shallow clone
                if os.path.exists(os.path.join(repo_path, ".git", "shallow")):
                    print("⚠️ This appears to be a shallow clone, which may have limited history.")

                # Try to count commits
                commit_count = 0
                for _ in repo.walk(head.target, pygit2.GIT_SORT_TIME):
                    commit_count += 1
                    if commit_count >= 10:  # Just check first 10
                        break

                if commit_count == 0:
                    print("⚠️ No commits found in the repository.")
                else:
                    print(f"Found at least {commit_count} commits in the repository.")

            except (pygit2.GitError, KeyError) as e:
                print(f"⚠️ Error accessing HEAD: {e}")
                print("This repository might be empty or corrupted.")

        except Exception as e:
            print(f"⚠️ Error opening repository with pygit2: {e}")

    except Exception as e:
        print(f"⚠️ Error during repository diagnosis: {e}")
+ if results["total_symbol_count"] > 0: + ai_symbol_percentage = results["ai_symbol_count"] / results["total_symbol_count"] * 100 + else: + ai_symbol_percentage = 0.0 + print(f"AI-touched symbols: {results['ai_symbol_count']} of {results['total_symbol_count']} ({ai_symbol_percentage:.1f}%)") + + # Print high-impact AI symbols + print("\nšŸ” High-Impact AI-Written Code:") + if results["high_impact_symbols"]: + for symbol in results["high_impact_symbols"][:10]: # Show top 10 + print(f" • {symbol['name']} ({symbol['filepath']})") + print(f" - Used by {symbol['usage_count']} other symbols") + print(f" - Last edited by: {symbol['last_editor']}") + else: + print(" No high-impact AI-written code found.") + + # Print top AI files + print("\nšŸ“ Top Files by AI Contribution:") + if stats["top_ai_files"]: + for file_path, percentage in stats["top_ai_files"][:10]: # Show top 10 + print(f" • {file_path}: {percentage:.1f}% AI contribution") + else: + print(" No files with AI contributions found.") + + # Save detailed results to file + output_path = "ai_impact_analysis.json" + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + + print(f"\nāœ… Detailed analysis saved to {output_path}") + + # Add attribution to symbols + print("\nšŸ·ļø Adding attribution information to symbols...") + add_attribution_to_symbols(codebase, ai_authors) + print("āœ… Attribution information added to symbols") + + print("\nYou can now access attribution information on symbols:") + print(" • symbol.last_editor - The last person who edited the symbol") + print(" • symbol.editor_history - List of all editors who have touched the symbol") + print(" • symbol.is_ai_authored - Whether the symbol was authored by AI") diff --git a/src/codegen/extensions/attribution/git_history.py b/src/codegen/extensions/attribution/git_history.py new file mode 100644 index 000000000..fc0625c05 --- /dev/null +++ b/src/codegen/extensions/attribution/git_history.py @@ -0,0 +1,326 @@ +import time +from 
class GitAttributionTracker:
    """Tracks attribution information for code symbols based on git history.

    Commits are recorded per author and per tracked file, and a symbol
    inherits the history of the file it lives in. A commit counts as an AI
    contribution when any entry of ``ai_authors`` occurs inside its
    ``"name <email>"`` identity; the same rule is used for commit totals,
    per-file statistics, per-symbol checks and the timeline.
    """

    def __init__(self, codebase: Codebase, ai_authors: Optional[list[str]] = None):
        """Initialize the attribution tracker.

        Args:
            codebase: The codebase to analyze
            ai_authors: List of author names/emails to track as AI contributors
                (defaults to ['devin[bot]', 'codegen[bot]'])
        """
        self.codebase = codebase
        self.repo_path = codebase.ctx.projects[0].repo_operator.repo_path
        self.repo = pygit2.Repository(self.repo_path)

        # Default AI authors if none provided (an empty list also selects the defaults)
        self.ai_authors = ai_authors or ["devin[bot]", "codegen[bot]"]

        # Cache structures
        self._file_history = {}  # file path -> list of commit info
        self._symbol_history = {}  # symbol id -> list of commit info
        self._author_contributions = defaultdict(list)  # "name <email>" -> list of commit info

        # Track if history has been built
        self._history_built = False

    def _matches_ai_author(self, identity: str) -> bool:
        """Return True when any configured AI author entry occurs in *identity*."""
        return any(marker in identity for marker in self.ai_authors)

    def _is_ai_commit(self, commit: dict) -> bool:
        """Return True when the commit's ``"name <email>"`` identity matches an AI author."""
        return self._matches_ai_author(f"{commit['author']} <{commit['email']}>")

    @staticmethod
    def _symbol_key(symbol) -> Optional[str]:
        """Return the ``"filepath:name"`` cache key of *symbol*, or None without a filepath."""
        filepath = getattr(symbol, "filepath", None)
        if not filepath:
            return None
        return f"{filepath}:{symbol.name}"

    def build_history(self, max_commits: Optional[int] = None) -> None:
        """Build the git history for the codebase.

        Walks the commits reachable from HEAD and records each one per author
        and per tracked file. Errors are printed rather than raised, and the
        history is marked as built even when the walk fails. Previously
        collected author and file history is discarded first, so calling this
        again rebuilds instead of double-counting.

        Args:
            max_commits: Maximum number of commits to process (None or 0 for all)
        """
        start_time = time.time()
        print(f"Building git history for {self.repo_path}...")

        # Cleared in place so references held elsewhere stay valid.
        self._file_history.clear()
        self._author_contributions.clear()

        # Check if repository exists and has commits
        try:
            head = self.repo.head
        except Exception as e:
            print(f"⚠️ Error accessing repository head: {e}")
            print("This might be a shallow clone or a repository without history.")
            self._history_built = True
            return

        # Walk through commit history
        commit_count = 0
        author_set = set()

        try:
            for commit in self.repo.walk(head.target, pygit2.GIT_SORT_TIME):
                # Track unique authors
                author_set.add(f"{commit.author.name} <{commit.author.email}>")

                # Process each diff in the commit
                if len(commit.parents) > 0:
                    try:
                        # Only the first parent is compared
                        diff = self.repo.diff(commit.parents[0], commit)
                        self._process_commit(commit, diff)
                    except Exception as e:
                        print(f"Error processing commit {commit.id}: {e}")
                else:
                    # Initial commit (no parents)
                    try:
                        # For initial commit, compare with empty tree
                        diff = commit.tree.diff_to_tree(context_lines=0)
                        self._process_commit(commit, diff)
                    except Exception as e:
                        print(f"Error processing initial commit {commit.id}: {e}")

                commit_count += 1
                if max_commits and commit_count >= max_commits:
                    break

                # Progress indicator
                if commit_count % 100 == 0:
                    print(f"Processed {commit_count} commits...")

        except Exception as e:
            print(f"⚠️ Error walking commit history: {e}")

        self._history_built = True
        elapsed = time.time() - start_time

        # Print diagnostic information
        print(f"Finished building history in {elapsed:.2f} seconds.")
        print(f"Processed {commit_count} commits from {len(author_set)} unique authors.")
        print(f"Found {len(self._file_history)} files with history.")
        print(f"Found {len(self._author_contributions)} contributors.")

        if self._author_contributions:
            print("Top contributors:")
            ranked = sorted(
                ((author, len(commits)) for author, commits in self._author_contributions.items()),
                key=lambda item: item[1],
                reverse=True,
            )
            for author, count in ranked[:5]:
                print(f" • {author}: {count} commits")
        else:
            print("⚠️ No contributors found. This might be due to:")
            print(" 1. Using a shallow clone without history")
            print(" 2. Repository access issues")
            print(" 3. Empty repository or no commits")

    def _process_commit(self, commit, diff) -> None:
        """Record one commit under its author and under every tracked file in *diff*."""
        author_name = commit.author.name
        author_email = commit.author.email
        author_id = f"{author_name} <{author_email}>"

        commit_info = {
            "author": author_name,
            "email": author_email,
            "timestamp": commit.author.time,
            "commit_id": str(commit.id),
            "message": commit.message.strip(),
        }

        # Track by author
        self._author_contributions.setdefault(author_id, []).append(commit_info)

        # Track by file
        for patch in diff:
            file_path = patch.delta.new_file.path

            # Skip if not a source file we care about
            if not self._is_tracked_file(file_path):
                continue

            # Each file gets its own copy so the per-file entry can carry the path
            file_commit = {**commit_info, "file_path": file_path}
            self._file_history.setdefault(file_path, []).append(file_commit)

    def _is_tracked_file(self, file_path: str) -> bool:
        """Check if a file should be tracked based on extension."""
        # Use the codebase's extensions; fall back to common source files
        # when none are configured
        extensions = self.codebase.ctx.extensions or [".py", ".js", ".ts", ".tsx", ".jsx"]
        return file_path.endswith(tuple(extensions))

    def _ensure_history_built(self) -> None:
        """Ensure git history has been built."""
        if not self._history_built:
            self.build_history()

    def map_symbols_to_history(self) -> None:
        """Map symbols in the codebase to their git history.

        Every symbol with a filepath receives a copy of its file's history;
        line ranges are not taken into account.
        """
        self._ensure_history_built()

        print("Mapping symbols to git history...")
        start_time = time.time()

        for symbol in self.codebase.symbols:
            symbol_id = self._symbol_key(symbol)
            if symbol_id is None:
                continue

            # For now, just associate all file changes with the symbol
            # A more sophisticated approach would use line ranges
            self._symbol_history[symbol_id] = list(self._file_history.get(symbol.filepath, []))

        elapsed = time.time() - start_time
        print(f"Finished mapping symbols in {elapsed:.2f} seconds.")

    def get_symbol_history(self, symbol: Symbol) -> list[dict]:
        """Get the edit history for a symbol.

        Args:
            symbol: The symbol to get history for

        Returns:
            List of commit information dictionaries (empty when the symbol has
            no filepath or has not been mapped)
        """
        self._ensure_history_built()

        symbol_id = self._symbol_key(symbol)
        if symbol_id is None:
            return []
        return self._symbol_history.get(symbol_id, [])

    def get_symbol_last_editor(self, symbol: Symbol) -> Optional[str]:
        """Get the last person who edited a symbol.

        Args:
            symbol: The symbol to check

        Returns:
            Author name or None if no history found
        """
        history = self.get_symbol_history(symbol)
        if not history:
            return None

        # The newest commit decides; no need to sort the whole history
        return max(history, key=lambda commit: commit["timestamp"])["author"]

    def get_ai_contribution_stats(self) -> dict:
        """Get statistics about AI contributions to the codebase.

        Returns:
            Dictionary with the keys ``total_commits``, ``ai_commits``,
            ``ai_percentage``, ``top_ai_files`` (up to 20 ``(path, percent)``
            pairs, highest first), ``ai_file_count`` (files above 50% AI
            commits) and ``total_file_count``
        """
        self._ensure_history_built()

        # Share of AI commits per file
        ai_contribution_percentage = {}
        for file_path, commits in self._file_history.items():
            if not commits:
                continue
            ai_count = sum(1 for commit in commits if self._is_ai_commit(commit))
            ai_contribution_percentage[file_path] = (ai_count / len(commits)) * 100

        # Get top files by AI contribution
        top_ai_files = sorted(ai_contribution_percentage.items(), key=lambda item: item[1], reverse=True)[:20]

        # Count AI commits and all commits
        ai_commits = sum(len(commits) for author, commits in self._author_contributions.items() if self._matches_ai_author(author))
        total_commits = sum(len(commits) for commits in self._author_contributions.values())

        # Calculate AI percentage safely
        ai_percentage = (ai_commits / total_commits) * 100 if total_commits > 0 else 0.0

        return {
            "total_commits": total_commits,
            "ai_commits": ai_commits,
            "ai_percentage": ai_percentage,
            "top_ai_files": top_ai_files,
            "ai_file_count": sum(1 for share in ai_contribution_percentage.values() if share > 50),
            "total_file_count": len(ai_contribution_percentage),
        }

    def get_ai_touched_symbols(self) -> list[Symbol]:
        """Get all symbols that have been touched by AI authors.

        Returns:
            List of symbols that have been edited by AI authors
        """
        self._ensure_history_built()

        return [
            symbol
            for symbol in self.codebase.symbols
            if any(self._is_ai_commit(commit) for commit in self.get_symbol_history(symbol))
        ]

    def get_ai_contribution_timeline(self) -> list[tuple[datetime, int]]:
        """Get a timeline of AI contributions over time.

        Commits are grouped by the UTC month of their author timestamp, so the
        result does not depend on the timezone of the machine running the
        analysis.

        Returns:
            List of (datetime, count) tuples, one per month with AI commits,
            oldest first; each datetime is the first day of that month
        """
        self._ensure_history_built()

        # (year, month) -> number of AI commits
        monthly_counts = defaultdict(int)

        for author, commits in self._author_contributions.items():
            if not self._matches_ai_author(author):
                continue
            for commit in commits:
                moment = time.gmtime(commit["timestamp"])
                monthly_counts[(moment.tm_year, moment.tm_mon)] += 1

        return [(datetime(year, month, 1), count) for (year, month), count in sorted(monthly_counts.items())]
def analyze_ai_impact(codebase: Codebase, ai_authors: Optional[list[str]] = None, max_commits: Optional[int] = None) -> dict:
    """Analyze the impact of AI on a codebase.

    Args:
        codebase: The codebase to analyze
        ai_authors: List of author names/emails to track as AI contributors
            (defaults to ['devin[bot]', 'codegen[bot]'])
        max_commits: Maximum number of commits to process (None for all)

    Returns:
        Dictionary with the keys ``stats``, ``ai_symbol_count``,
        ``total_symbol_count``, ``high_impact_symbols`` (at most 20, most used
        first), ``timeline`` and ``contributors`` (``(author, commit count)``
        pairs, most commits first)
    """
    tracker = GitAttributionTracker(codebase, ai_authors)
    tracker.build_history(max_commits)
    tracker.map_symbols_to_history()

    # Get basic stats
    stats = tracker.get_ai_contribution_stats()

    # Get AI-touched symbols
    ai_symbols = tracker.get_ai_touched_symbols()

    # Find high-impact AI symbols (those with many dependents)
    high_impact_symbols = []
    for symbol in ai_symbols:
        # Read `usages` once per symbol instead of once per check.
        usages = getattr(symbol, "usages", None)
        if usages is None:
            continue
        usage_count = len(usages)
        if usage_count > 5:
            high_impact_symbols.append(
                {
                    "name": symbol.name,
                    "filepath": symbol.filepath,
                    "usage_count": usage_count,
                    "last_editor": tracker.get_symbol_last_editor(symbol),
                }
            )

    # Sort by usage count
    high_impact_symbols.sort(key=lambda entry: entry["usage_count"], reverse=True)

    # Get timeline data
    timeline_data = [{"date": dt.strftime("%Y-%m"), "count": count} for dt, count in tracker.get_ai_contribution_timeline()]

    # Get list of all contributors with commit counts, most commits first
    contributors = sorted(
        ((author_id, len(commits)) for author_id, commits in tracker._author_contributions.items()),
        key=lambda entry: entry[1],
        reverse=True,
    )

    return {
        "stats": stats,
        "ai_symbol_count": len(ai_symbols),
        "total_symbol_count": len(list(codebase.symbols)),
        "high_impact_symbols": high_impact_symbols[:20],  # Top 20
        "timeline": timeline_data,
        "contributors": contributors,
    }


def add_attribution_to_symbols(codebase: Codebase, ai_authors: Optional[list[str]] = None) -> None:
    """Add attribution information to symbols in the codebase.

    This adds the following attributes to each symbol that has recorded history:
    - last_editor: The name of the last person who edited the symbol
    - editor_history: Sorted list of all editors who have touched the symbol
    - is_ai_authored: Whether any commit in the history comes from an AI author

    Symbols without recorded history are left untouched.

    Args:
        codebase: The codebase to analyze
        ai_authors: List of author names/emails to track as AI contributors
    """
    tracker = GitAttributionTracker(codebase, ai_authors)
    tracker.build_history()
    tracker.map_symbols_to_history()

    # Add attribution to each symbol
    for symbol in codebase.symbols:
        history = tracker.get_symbol_history(symbol)
        if not history:
            continue

        # Add last editor (author of the newest commit)
        newest = max(history, key=lambda commit: commit["timestamp"])
        symbol.last_editor = newest["author"]

        # Add editor history (unique editors, sorted so the order is reproducible)
        symbol.editor_history = sorted({commit["author"] for commit in history})

        # Add is_ai_authored flag; an AI author entry matches when it occurs in
        # the commit's "name <email>" identity, the same rule the contributor
        # listing and the commit statistics apply
        symbol.is_ai_authored = any(
            marker in f"{commit['author']} <{commit['email']}>"
            for commit in history
            for marker in tracker.ai_authors
        )
os.path.exists(".git"): print("Using current directory as repository...") @@ -20,50 +20,41 @@ repo_path = os.getcwd() repo_config = RepoConfig.from_repo_path(repo_path) repo_operator = RepoOperator(repo_config=repo_config) - + # Initialize codebase with a project config - project = ProjectConfig.from_repo_operator( - repo_operator=repo_operator, - programming_language=ProgrammingLanguage.PYTHON - ) + project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=ProgrammingLanguage.PYTHON) codebase = Codebase(projects=[project]) else: # Option B: Try to find a git repository in parent directories print("Searching for git repository in parent directories...") current_dir = os.getcwd() found_git = False - + while current_dir != os.path.dirname(current_dir): # Stop at root if os.path.exists(os.path.join(current_dir, ".git")): print(f"Found git repository at {current_dir}") repo_config = RepoConfig.from_repo_path(current_dir) repo_operator = RepoOperator(repo_config=repo_config) - + # Initialize codebase with a project config - project = ProjectConfig.from_repo_operator( - repo_operator=repo_operator, - programming_language=ProgrammingLanguage.PYTHON - ) + project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=ProgrammingLanguage.PYTHON) codebase = Codebase(projects=[project]) found_git = True break current_dir = os.path.dirname(current_dir) - + if not found_git: # Option C: Use from_repo method which handles cloning print("No local git repository found. 
Cloning a repository...") - codebase = Codebase.from_repo( - repo_full_name="codegen-sh/codegen", - language="python" - ) - + codebase = Codebase.from_repo(repo_full_name="codegen-sh/codegen", language="python") + print(f"Codebase loaded with {len(codebase.files)} files and {len(codebase.symbols)} symbols") - + # Run the analysis run(codebase) - + except Exception as e: print(f"\nāŒ Error: {str(e)}") print("\nTraceback:") traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/codegen-examples/examples/attributions/symbol_attribution.py b/codegen-examples/examples/attributions/symbol_attribution.py index 16cd8179e..dc2512dd7 100644 --- a/codegen-examples/examples/attributions/symbol_attribution.py +++ b/codegen-examples/examples/attributions/symbol_attribution.py @@ -9,24 +9,25 @@ from codegen.sdk.codebase.config import ProjectConfig from codegen.shared.enums.programming_language import ProgrammingLanguage + def print_symbol_attribution(codebase): """Print attribution information for symbols in the codebase.""" print("\nšŸ” Symbol Attribution Examples:") - + # First, make sure attribution information is added to symbols - ai_authors = ['devin[bot]', 'codegen[bot]', 'github-actions[bot]'] + ai_authors = ["devin[bot]", "codegen[bot]", "github-actions[bot]"] add_attribution_to_symbols(codebase, ai_authors) - + # Get some interesting symbols to examine # Let's look at classes and functions with the most usages symbols_with_usages = [] for symbol in codebase.symbols: - if hasattr(symbol, 'usages') and len(symbol.usages) > 0: + if hasattr(symbol, "usages") and len(symbol.usages) > 0: symbols_with_usages.append((symbol, len(symbol.usages))) - + # Sort by usage count (most used first) symbols_with_usages.sort(key=lambda x: x[1], reverse=True) - + # Print attribution for top symbols count = 0 for symbol, usage_count in symbols_with_usages[:10]: # Look at top 10 most used symbols @@ -34,60 +35,58 @@ def print_symbol_attribution(codebase): 
print(f"\nšŸ“Š Symbol #{count}: {symbol.name} ({type(symbol).__name__})") print(f" • File: {symbol.filepath}") print(f" • Usages: {usage_count}") - + # Print attribution information - if hasattr(symbol, 'last_editor'): + if hasattr(symbol, "last_editor"): print(f" • Last editor: {symbol.last_editor}") else: print(" • Last editor: Not available") - - if hasattr(symbol, 'editor_history') and symbol.editor_history: - print(f" • Editor history: {', '.join(symbol.editor_history[:5])}" + - (f" and {len(symbol.editor_history) - 5} more..." if len(symbol.editor_history) > 5 else "")) + + if hasattr(symbol, "editor_history") and symbol.editor_history: + print(f" • Editor history: {', '.join(symbol.editor_history[:5])}" + (f" and {len(symbol.editor_history) - 5} more..." if len(symbol.editor_history) > 5 else "")) else: print(" • Editor history: Not available") - - if hasattr(symbol, 'is_ai_authored'): + + if hasattr(symbol, "is_ai_authored"): print(f" • AI authored: {'Yes' if symbol.is_ai_authored else 'No'}") else: print(" • AI authored: Not available") + if __name__ == "__main__": try: print("Initializing codebase...") - + # Use current directory if it's a git repository if os.path.exists(".git"): print("Using current directory as repository...") repo_path = os.getcwd() repo_config = RepoConfig.from_repo_path(repo_path) repo_operator = RepoOperator(repo_config=repo_config) - - project = ProjectConfig.from_repo_operator( - repo_operator=repo_operator, - programming_language=ProgrammingLanguage.PYTHON - ) + + project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=ProgrammingLanguage.PYTHON) codebase = Codebase(projects=[project]) else: # Use from_repo method for a well-known repository print("Using a sample repository...") codebase = Codebase.from_repo( repo_full_name="codegen-sh/codegen", - #commit="", # Using a specific commit for consistency - language="python" + # commit="", # Using a specific commit for consistency + 
language="python", ) - + print(f"Codebase loaded with {len(codebase.files)} files and {len(codebase.symbols)} symbols") - + # First run the analysis to gather attribution data print("\nšŸ” Running AI impact analysis...") run(codebase) - + # Then show examples of accessing attribution information print_symbol_attribution(codebase) - + except Exception as e: print(f"\nāŒ Error: {str(e)}") import traceback + traceback.print_exc() sys.exit(1) diff --git a/pyproject.toml b/pyproject.toml index d3ce75054..e2a7470d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ dependencies = [ "hatch-vcs>=0.4.0", "hatchling>=1.25.0", "pyinstrument>=5.0.0", - "pip>=24.3.1", # This is needed for some NPM/YARN/PNPM post-install scripts to work! + "pip>=24.3.1", # This is needed for some NPM/YARN/PNPM post-install scripts to work! "rich-click>=1.8.5", "python-dotenv>=1.0.1", "giturlparse", diff --git a/src/codegen/extensions/attribution/3pp/cursor.py b/src/codegen/extensions/attribution/3pp/cursor.py index 4fb534495..3d1f18046 100644 --- a/src/codegen/extensions/attribution/3pp/cursor.py +++ b/src/codegen/extensions/attribution/3pp/cursor.py @@ -577,9 +577,9 @@ async def main(): if search_results: print(f"Found {len(search_results)} results:") for i, result in enumerate(search_results[:10]): # Show first 10 results - print(f" {i+1}. [{result['type']}] {result['chatTitle']} ({result['workspaceName']})") + print(f" {i + 1}. [{result['type']}] {result['chatTitle']} ({result['workspaceName']})") # Show a snippet of the matching text - matching_text = result['matchingText'] + matching_text = result["matchingText"] if len(matching_text) > 100: matching_text = matching_text[:97] + "..." 
print(f" Match: {matching_text}") diff --git a/src/codegen/extensions/attribution/git_history.py b/src/codegen/extensions/attribution/git_history.py index fc0625c05..42ee0c40a 100644 --- a/src/codegen/extensions/attribution/git_history.py +++ b/src/codegen/extensions/attribution/git_history.py @@ -25,7 +25,7 @@ def __init__(self, codebase: Codebase, ai_authors: Optional[list[str]] = None): self.repo = pygit2.Repository(self.repo_path) # Default AI authors if none provided - self.ai_authors = ai_authors or ['devin[bot]', 'codegen[bot]'] + self.ai_authors = ai_authors or ["devin[bot]", "codegen[bot]"] # Cache structures self._file_history = {} # file path -> list of commit info @@ -101,11 +101,7 @@ def build_history(self, max_commits: Optional[int] = None) -> None: if len(self._author_contributions) > 0: print("Top contributors:") - top_contributors = sorted( - [(author, len(commits)) for author, commits in self._author_contributions.items()], - key=lambda x: x[1], - reverse=True - )[:5] + top_contributors = sorted([(author, len(commits)) for author, commits in self._author_contributions.items()], key=lambda x: x[1], reverse=True)[:5] for author, count in top_contributors: print(f" • {author}: {count} commits") else: @@ -123,11 +119,11 @@ def _process_commit(self, commit, diff) -> None: commit_id = str(commit.id) commit_info = { - 'author': author_name, - 'email': author_email, - 'timestamp': timestamp, - 'commit_id': commit_id, - 'message': commit.message.strip(), + "author": author_name, + "email": author_email, + "timestamp": timestamp, + "commit_id": commit_id, + "message": commit.message.strip(), } # Track by author @@ -145,7 +141,7 @@ def _process_commit(self, commit, diff) -> None: self._file_history[file_path] = [] file_commit = commit_info.copy() - file_commit['file_path'] = file_path + file_commit["file_path"] = file_path self._file_history[file_path].append(file_commit) def _is_tracked_file(self, file_path: str) -> bool: @@ -155,7 +151,7 @@ def 
_is_tracked_file(self, file_path: str) -> bool: # If we can't determine extensions, track common source files if not extensions: - extensions = ['.py', '.js', '.ts', '.tsx', '.jsx'] + extensions = [".py", ".js", ".ts", ".tsx", ".jsx"] return any(file_path.endswith(ext) for ext in extensions) @@ -173,7 +169,7 @@ def map_symbols_to_history(self) -> None: # For each symbol, find commits that modified its file for symbol in self.codebase.symbols: - if not hasattr(symbol, 'filepath') or not symbol.filepath: + if not hasattr(symbol, "filepath") or not symbol.filepath: continue symbol_id = f"{symbol.filepath}:{symbol.name}" @@ -201,7 +197,7 @@ def get_symbol_history(self, symbol: Symbol) -> list[dict]: """ self._ensure_history_built() - if not hasattr(symbol, 'filepath') or not symbol.filepath: + if not hasattr(symbol, "filepath") or not symbol.filepath: return [] symbol_id = f"{symbol.filepath}:{symbol.name}" @@ -221,8 +217,8 @@ def get_symbol_last_editor(self, symbol: Symbol) -> Optional[str]: return None # Sort by timestamp (newest first) and return the author - sorted_history = sorted(history, key=lambda x: x['timestamp'], reverse=True) - return sorted_history[0]['author'] + sorted_history = sorted(history, key=lambda x: x["timestamp"], reverse=True) + return sorted_history[0]["author"] def get_ai_contribution_stats(self) -> dict: """Get statistics about AI contributions to the codebase. 
@@ -239,7 +235,7 @@ def get_ai_contribution_stats(self) -> dict: for file_path, commits in self._file_history.items(): for commit in commits: total_file_commits[file_path] += 1 - if commit['author'] in self.ai_authors or commit['email'] in self.ai_authors: + if commit["author"] in self.ai_authors or commit["email"] in self.ai_authors: ai_file_commits[file_path] += 1 # Find files with highest AI contribution percentage @@ -249,17 +245,10 @@ def get_ai_contribution_stats(self) -> dict: ai_contribution_percentage[file_path] = (ai_file_commits[file_path] / total) * 100 # Get top files by AI contribution - top_ai_files = sorted( - ai_contribution_percentage.items(), - key=lambda x: x[1], - reverse=True - )[:20] + top_ai_files = sorted(ai_contribution_percentage.items(), key=lambda x: x[1], reverse=True)[:20] # Count total AI commits - ai_commits = sum( - len(commits) for author, commits in self._author_contributions.items() - if any(name in author for name in self.ai_authors) - ) + ai_commits = sum(len(commits) for author, commits in self._author_contributions.items() if any(name in author for name in self.ai_authors)) total_commits = sum(len(commits) for commits in self._author_contributions.values()) @@ -270,12 +259,12 @@ def get_ai_contribution_stats(self) -> dict: ai_percentage = 0.0 return { - 'total_commits': total_commits, - 'ai_commits': ai_commits, - 'ai_percentage': ai_percentage, - 'top_ai_files': top_ai_files, - 'ai_file_count': len([f for f, p in ai_contribution_percentage.items() if p > 50]), - 'total_file_count': len(total_file_commits), + "total_commits": total_commits, + "ai_commits": ai_commits, + "ai_percentage": ai_percentage, + "top_ai_files": top_ai_files, + "ai_file_count": len([f for f, p in ai_contribution_percentage.items() if p > 50]), + "total_file_count": len(total_file_commits), } def get_ai_touched_symbols(self) -> list[Symbol]: @@ -292,10 +281,7 @@ def get_ai_touched_symbols(self) -> list[Symbol]: history = self.get_symbol_history(symbol) 
# Check if any commit is from an AI author - if any( - commit['author'] in self.ai_authors or commit['email'] in self.ai_authors - for commit in history - ): + if any(commit["author"] in self.ai_authors or commit["email"] in self.ai_authors for commit in history): ai_symbols.append(symbol) return ai_symbols @@ -315,7 +301,7 @@ def get_ai_contribution_timeline(self) -> list[tuple[datetime, int]]: if any(name in author for name in self.ai_authors): for commit in commits: # Convert timestamp to year-month - dt = datetime.fromtimestamp(commit['timestamp']) + dt = datetime.fromtimestamp(commit["timestamp"]) month_key = f"{dt.year}-{dt.month:02d}" monthly_counts[month_key] += 1 diff --git a/src/codegen/extensions/attribution/main.py b/src/codegen/extensions/attribution/main.py index 5f81169ac..a282fda89 100644 --- a/src/codegen/extensions/attribution/main.py +++ b/src/codegen/extensions/attribution/main.py @@ -4,11 +4,7 @@ from codegen.sdk.core.codebase import Codebase -def analyze_ai_impact( - codebase: Codebase, - ai_authors: Optional[list[str]] = None, - max_commits: Optional[int] = None -) -> dict: +def analyze_ai_impact(codebase: Codebase, ai_authors: Optional[list[str]] = None, max_commits: Optional[int] = None) -> dict: """Analyze the impact of AI on a codebase. 
Args: @@ -33,23 +29,15 @@ def analyze_ai_impact( # Find high-impact AI symbols (those with many dependents) high_impact_symbols = [] for symbol in ai_symbols: - if hasattr(symbol, 'usages') and len(symbol.usages) > 5: - high_impact_symbols.append({ - 'name': symbol.name, - 'filepath': symbol.filepath, - 'usage_count': len(symbol.usages), - 'last_editor': tracker.get_symbol_last_editor(symbol) - }) + if hasattr(symbol, "usages") and len(symbol.usages) > 5: + high_impact_symbols.append({"name": symbol.name, "filepath": symbol.filepath, "usage_count": len(symbol.usages), "last_editor": tracker.get_symbol_last_editor(symbol)}) # Sort by usage count - high_impact_symbols.sort(key=lambda x: x['usage_count'], reverse=True) + high_impact_symbols.sort(key=lambda x: x["usage_count"], reverse=True) # Get timeline data timeline = tracker.get_ai_contribution_timeline() - timeline_data = [ - {'date': dt.strftime('%Y-%m'), 'count': count} - for dt, count in timeline - ] + timeline_data = [{"date": dt.strftime("%Y-%m"), "count": count} for dt, count in timeline] # Get list of all contributors with commit counts contributors = [] @@ -60,12 +48,12 @@ def analyze_ai_impact( contributors.sort(key=lambda x: x[1], reverse=True) return { - 'stats': stats, - 'ai_symbol_count': len(ai_symbols), - 'total_symbol_count': len(list(codebase.symbols)), - 'high_impact_symbols': high_impact_symbols[:20], # Top 20 - 'timeline': timeline_data, - 'contributors': contributors, + "stats": stats, + "ai_symbol_count": len(ai_symbols), + "total_symbol_count": len(list(codebase.symbols)), + "high_impact_symbols": high_impact_symbols[:20], # Top 20 + "timeline": timeline_data, + "contributors": contributors, } @@ -90,14 +78,12 @@ def add_attribution_to_symbols(codebase: Codebase, ai_authors: Optional[list[str # Add last editor if history: - sorted_history = sorted(history, key=lambda x: x['timestamp'], reverse=True) - symbol.last_editor = sorted_history[0]['author'] + sorted_history = sorted(history, 
key=lambda x: x["timestamp"], reverse=True) + symbol.last_editor = sorted_history[0]["author"] # Add editor history (unique editors) - editors = {commit['author'] for commit in history} + editors = {commit["author"] for commit in history} symbol.editor_history = list(editors) # Add is_ai_authored flag - symbol.is_ai_authored = any( - editor in tracker.ai_authors for editor in symbol.editor_history - ) + symbol.is_ai_authored = any(editor in tracker.ai_authors for editor in symbol.editor_history) From aca6587b822c5118b80515f2d98bac81930bdf1a Mon Sep 17 00:00:00 2001 From: tkucar Date: Tue, 4 Mar 2025 02:51:06 +0100 Subject: [PATCH 3/7] docs --- docs/mint.json | 3 +- docs/tutorials/attributions.mdx | 194 ++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 docs/tutorials/attributions.mdx diff --git a/docs/mint.json b/docs/mint.json index d5062557c..fa74dfea4 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -105,7 +105,8 @@ "tutorials/python2-to-python3", "tutorials/flask-to-fastapi", "tutorials/build-mcp", - "tutorials/neo4j-graph" + "tutorials/neo4j-graph", + "tutorials/attributions" ] }, { diff --git a/docs/tutorials/attributions.mdx b/docs/tutorials/attributions.mdx new file mode 100644 index 000000000..cf86538f9 --- /dev/null +++ b/docs/tutorials/attributions.mdx @@ -0,0 +1,194 @@ +--- +title: "Code statistics and attributions" +sidebarTitle: "Code statistics and attributions" +description: "Learn how to analyze code statistics and attributions using Codegen" +icon: "network-wired" +iconType: "solid" +--- + +# AI Impact Analysis + +This tutorial shows how to use Codegen's attribution extension to analyze the impact of AI on your +codebase. You'll learn how to identify which parts of your code were written by AI tools like +GitHub Copilot, Devin, or other AI assistants. + +Note: the code is flexible - you can track CI pipeline bots, or any other contributor you want. 
+ + +## Overview + +The attribution extension analyzes git history to: + +1. Identify which symbols (functions, classes, etc.) were authored or modified by AI tools +2. Calculate the percentage of AI contributions in your codebase +3. Find high-impact AI-written code (code that many other parts depend on) +4. Track the evolution of AI contributions over time + +## Installation + +The attribution extension is included with Codegen. No additional installation is required. + +## Basic Usage + +### Running the Analysis + +You can run the AI impact analysis using the Codegen CLI: + +```bash +codegen analyze-ai-impact +``` + +Or from Python code: + +```python +from codegen import Codebase +from codegen.extensions.attribution.cli import run + +# Initialize codebase from a GitHub repository (cloned if needed) +codebase = Codebase.from_repo("your-org/your-repo", language="python") + +# Run the analysis +run(codebase) +``` + +### Understanding the Results + +The analysis will print a summary of AI contributions to your console and save detailed results to a JSON file.
The summary includes: + +- List of all contributors (human and AI) +- Percentage of commits made by AI +- Number of files and symbols touched by AI +- High-impact AI-written code (code with many dependents) +- Top files by AI contribution percentage + +## Advanced Usage + +### Accessing Attribution Information + +After running the analysis, each symbol in your codebase will have attribution information attached to it: + +```python +from codegen import Codebase +from codegen.extensions.attribution.main import add_attribution_to_symbols + +# Initialize codebase +codebase = Codebase.from_repo("your-org/your-repo", language="python") + +# Add attribution information to symbols +ai_authors = ['github-actions[bot]', 'dependabot[bot]', 'copilot[bot]'] +add_attribution_to_symbols(codebase, ai_authors) + +# Access attribution information on symbols +for symbol in codebase.symbols: + if hasattr(symbol, 'is_ai_authored') and symbol.is_ai_authored: + print(f"AI-authored symbol: {symbol.name} in {symbol.filepath}") + print(f"Last editor: {symbol.last_editor}") + print(f"All editors: {symbol.editor_history}") +``` + +### Customizing AI Author Detection + +By default, the analysis looks for common AI bot names in commit authors. 
+You can customize this by providing your own list of AI authors: + +```python +from codegen import Codebase +from codegen.extensions.attribution.main import analyze_ai_impact + +# Initialize codebase +codebase = Codebase.from_repo("your-org/your-repo", language="python") + +# Define custom AI authors +ai_authors = [ + 'github-actions[bot]', + 'dependabot[bot]', + 'copilot[bot]', + 'devin[bot]', + 'your-custom-ai-email@example.com' +] + +# Run analysis with custom AI authors +results = analyze_ai_impact(codebase, ai_authors) +``` + +## Example: Contributor Analysis + +Here's a complete example that analyzes contributors to your codebase and their impact: + +```python +import os +from collections import Counter + +from codegen import Codebase +from codegen.extensions.attribution.main import add_attribution_to_symbols +from codegen.git.repo_operator.repo_operator import RepoOperator +from codegen.git.schemas.repo_config import RepoConfig +from codegen.sdk.codebase.config import ProjectConfig +from codegen.shared.enums.programming_language import ProgrammingLanguage + +def analyze_contributors(codebase): + """Analyze contributors to the codebase and their impact.""" + print("\n🔍 Contributor Analysis:") + + # Define which authors are considered AI + ai_authors = ['devin[bot]', 'codegen[bot]', 'github-actions[bot]', 'dependabot[bot]'] + + # Add attribution information to all symbols + print("Adding attribution information to symbols...") + add_attribution_to_symbols(codebase, ai_authors) + + # Collect statistics about contributors + contributor_stats = Counter() + ai_contributor_stats = Counter() + + print("Analyzing symbol attributions...") + for symbol in codebase.symbols: + if hasattr(symbol, 'last_editor') and symbol.last_editor: + contributor_stats[symbol.last_editor] += 1 + + # Track if this is an AI contributor + if any(ai in symbol.last_editor for ai in ai_authors): + ai_contributor_stats[symbol.last_editor] += 1 + + # Print top contributors overall +
print("\n👥 Top Contributors by Symbols Authored:") + for contributor, count in contributor_stats.most_common(10): + is_ai = any(ai in contributor for ai in ai_authors) + ai_indicator = "🤖" if is_ai else "👤" + print(f" {ai_indicator} {contributor}: {count} symbols") + + # Print top AI contributors if any + if ai_contributor_stats: + print("\n🤖 Top AI Contributors:") + for contributor, count in ai_contributor_stats.most_common(5): + print(f" • {contributor}: {count} symbols") + +# Initialize codebase from current directory +if os.path.exists(".git"): + repo_path = os.getcwd() + repo_config = RepoConfig.from_repo_path(repo_path) + repo_operator = RepoOperator(repo_config=repo_config) + + project = ProjectConfig.from_repo_operator( + repo_operator=repo_operator, + programming_language=ProgrammingLanguage.PYTHON + ) + codebase = Codebase(projects=[project]) + + # Run the contributor analysis + analyze_contributors(codebase) +``` + +## Conclusion + +The attribution extension provides valuable insights into how AI tools are being used in your +development process. By understanding which parts of your codebase are authored by AI, you can: + +- Track the adoption of AI coding assistants in your team +- Identify areas where AI is most effective +- Ensure appropriate review of AI-generated code +- Measure the impact of AI on developer productivity + +For more advanced usage, check out the [API reference](/api-reference/extensions/attribution) +for the attribution extension.
+ From 9d308701712243e0fb657312a7a808e7d0743707 Mon Sep 17 00:00:00 2001 From: tomcodgen <191515280+tomcodgen@users.noreply.github.com> Date: Tue, 4 Mar 2025 01:52:02 +0000 Subject: [PATCH 4/7] Automated pre-commit update --- docs/mint.json | 756 ++++++++++++++++++++++++------------------------- 1 file changed, 377 insertions(+), 379 deletions(-) diff --git a/docs/mint.json b/docs/mint.json index fa74dfea4..435da0a76 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -1,380 +1,378 @@ { - "$schema": "https://mintlify.com/schema.json", - "name": "Codegen", - "logo": { - "dark": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45a3e32761c42b324b_Codegen_Logomark_Dark.svg", - "light": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45bf55446746125835_Codegen_Logomark_Light.svg" - }, - "modeToggle": { - "default": "dark" - }, - "metadata": { - "og:site_name": "Codegen", - "og:title": "Codegen - Manipulate Code at Scale", - "og:description": "A scriptable interface to a powerful, multi-lingual language server built on top of Tree-sitter.", - "og:url": "https://docs.codegen.com", - "og:locale": "en_US", - "og:logo": "https://i.imgur.com/f4OVOqI.png", - "article:publisher": "Codegen, Inc.", - "twitter:site": "@codegen" - }, - "favicon": "/favicon.svg", - "colors": { - "primary": "#a277ff", - "light": "#a277ff", - "dark": "#a277ff", - "anchors": { - "from": "#61ffca", - "to": "#61ffca" - } - }, - "theme": "prism", - "background": { - "style": "gradient" - }, - "analytics": { - "posthog": { - "apiKey": "phc_GLxaINoQJnuyCyxDmTciQqzdKBYFVDkY7bRBO4bDdso" - } - }, - "feedback": { - "thumbsRating": true - }, - "topbarCtaButton": { - "name": "GitHub", - "url": "https://github.com/codegen-sh/codegen-sdk" - }, - "tabs": [ - { - "name": "API Reference", - "url": "/api-reference" - }, - { - "name": "CLI", - "url": "/cli" - }, - { - "name": "Blog", - "url": "/blog" - }, - { - "name": "Changelog", - "url": "/changelog" - } - ], - "navigation": [ - { 
- "group": "Introduction", - "pages": [ - "introduction/overview", - "introduction/getting-started", - "introduction/installation", - "introduction/ide-usage", - "introduction/work-with-ai", - "introduction/how-it-works", - "introduction/guiding-principles", - "introduction/community", - "introduction/about", - "introduction/faq" - ] - }, - { - "group": "Tutorials", - "pages": [ - "tutorials/at-a-glance", - "tutorials/build-code-agent", - "tutorials/slack-bot", - "tutorials/github-review-bot", - "tutorials/deep-code-research", - "tutorials/training-data", - "tutorials/codebase-visualization", - "tutorials/migrating-apis", - "tutorials/organize-your-codebase", - "tutorials/promise-to-async-await", - "tutorials/modularity", - "tutorials/manage-feature-flags", - "tutorials/deleting-dead-code", - "tutorials/increase-type-coverage", - "tutorials/managing-typescript-exports", - "tutorials/converting-default-exports", - "tutorials/creating-documentation", - "tutorials/react-modernization", - "tutorials/unittest-to-pytest", - "tutorials/sqlalchemy-1.6-to-2.0", - "tutorials/fixing-import-loops-in-pytorch", - "tutorials/python2-to-python3", - "tutorials/flask-to-fastapi", - "tutorials/build-mcp", - "tutorials/neo4j-graph", - "tutorials/attributions" - ] - }, - { - "group": "Building with Codegen", - "pages": [ - "building-with-codegen/at-a-glance", - "building-with-codegen/parsing-codebases", - "building-with-codegen/reusable-codemods", - "building-with-codegen/dot-codegen", - "building-with-codegen/function-decorator", - "building-with-codegen/language-support", - "building-with-codegen/commit-and-reset", - "building-with-codegen/git-operations", - "building-with-codegen/files-and-directories", - "building-with-codegen/the-editable-api", - "building-with-codegen/symbol-api", - "building-with-codegen/class-api", - "building-with-codegen/imports", - "building-with-codegen/exports", - "building-with-codegen/inheritable-behaviors", - 
"building-with-codegen/statements-and-code-blocks", - "building-with-codegen/dependencies-and-usages", - "building-with-codegen/function-calls-and-callsites", - "building-with-codegen/variable-assignments", - "building-with-codegen/local-variables", - "building-with-codegen/comments-and-docstrings", - "building-with-codegen/external-modules", - "building-with-codegen/type-annotations", - "building-with-codegen/moving-symbols", - "building-with-codegen/collections", - "building-with-codegen/traversing-the-call-graph", - "building-with-codegen/react-and-jsx", - "building-with-codegen/codebase-visualization", - "building-with-codegen/flagging-symbols", - "building-with-codegen/calling-out-to-llms", - "building-with-codegen/semantic-code-search", - "building-with-codegen/reducing-conditions" - ] - }, - { - "group": "CLI", - "pages": [ - "cli/about", - "cli/init", - "cli/notebook", - "cli/create", - "cli/run", - "cli/reset", - "cli/expert" - ] - }, - { - "group": "Changelog", - "pages": [ - "changelog/changelog" - ] - }, - { - "group": "Blog", - "pages": [ - "blog/posts", - "blog/act-via-code", - "blog/promise-to-async-await-twilio", - "blog/fixing-import-loops" - ] - }, - { - "group": "API Reference", - "pages": [ - "api-reference/index", - { - "group": "Core", - "icon": "code", - "pages": [ - "api-reference/core/Argument", - "api-reference/core/Assignment", - "api-reference/core/AssignmentStatement", - "api-reference/core/Attribute", - "api-reference/core/AwaitExpression", - "api-reference/core/BinaryExpression", - "api-reference/core/BlockStatement", - "api-reference/core/Boolean", - "api-reference/core/Callable", - "api-reference/core/CatchStatement", - "api-reference/core/ChainedAttribute", - "api-reference/core/Class", - "api-reference/core/CodeBlock", - "api-reference/core/CodeOwner", - "api-reference/core/Codebase", - "api-reference/core/Comment", - "api-reference/core/CommentGroup", - "api-reference/core/ComparisonExpression", - "api-reference/core/Decorator", 
- "api-reference/core/Dict", - "api-reference/core/Directory", - "api-reference/core/Editable", - "api-reference/core/Export", - "api-reference/core/ExportStatement", - "api-reference/core/Exportable", - "api-reference/core/Expression", - "api-reference/core/ExpressionGroup", - "api-reference/core/ExpressionStatement", - "api-reference/core/ExternalModule", - "api-reference/core/File", - "api-reference/core/FlagKwargs", - "api-reference/core/ForLoopStatement", - "api-reference/core/Function", - "api-reference/core/FunctionCall", - "api-reference/core/GenericType", - "api-reference/core/HasBlock", - "api-reference/core/HasName", - "api-reference/core/HasValue", - "api-reference/core/IfBlockStatement", - "api-reference/core/Import", - "api-reference/core/ImportStatement", - "api-reference/core/ImportType", - "api-reference/core/Importable", - "api-reference/core/Interface", - "api-reference/core/List", - "api-reference/core/MessageType", - "api-reference/core/MultiExpression", - "api-reference/core/MultiLineCollection", - "api-reference/core/Name", - "api-reference/core/NamedType", - "api-reference/core/NoneType", - "api-reference/core/Number", - "api-reference/core/Pair", - "api-reference/core/Parameter", - "api-reference/core/ParenthesizedExpression", - "api-reference/core/Placeholder", - "api-reference/core/PlaceholderType", - "api-reference/core/RaiseStatement", - "api-reference/core/ReturnStatement", - "api-reference/core/SourceFile", - "api-reference/core/Span", - "api-reference/core/Statement", - "api-reference/core/StatementType", - "api-reference/core/String", - "api-reference/core/StubPlaceholder", - "api-reference/core/SubscriptExpression", - "api-reference/core/SwitchCase", - "api-reference/core/SwitchStatement", - "api-reference/core/Symbol", - "api-reference/core/SymbolGroup", - "api-reference/core/SymbolStatement", - "api-reference/core/TernaryExpression", - "api-reference/core/TryCatchStatement", - "api-reference/core/Tuple", - 
"api-reference/core/TupleType", - "api-reference/core/Type", - "api-reference/core/TypeAlias", - "api-reference/core/TypePlaceholder", - "api-reference/core/Typeable", - "api-reference/core/UnaryExpression", - "api-reference/core/UnionType", - "api-reference/core/Unpack", - "api-reference/core/Unwrappable", - "api-reference/core/Usable", - "api-reference/core/Usage", - "api-reference/core/UsageKind", - "api-reference/core/UsageType", - "api-reference/core/Value", - "api-reference/core/WhileStatement", - "api-reference/core/WithStatement" - ] - }, - { - "group": "Python", - "icon": "python", - "pages": [ - "api-reference/python/PyAssignment", - "api-reference/python/PyAssignmentStatement", - "api-reference/python/PyAttribute", - "api-reference/python/PyBlockStatement", - "api-reference/python/PyBreakStatement", - "api-reference/python/PyCatchStatement", - "api-reference/python/PyChainedAttribute", - "api-reference/python/PyClass", - "api-reference/python/PyCodeBlock", - "api-reference/python/PyComment", - "api-reference/python/PyCommentGroup", - "api-reference/python/PyCommentType", - "api-reference/python/PyConditionalExpression", - "api-reference/python/PyDecorator", - "api-reference/python/PyFile", - "api-reference/python/PyForLoopStatement", - "api-reference/python/PyFunction", - "api-reference/python/PyGenericType", - "api-reference/python/PyHasBlock", - "api-reference/python/PyIfBlockStatement", - "api-reference/python/PyImport", - "api-reference/python/PyImportStatement", - "api-reference/python/PyMatchCase", - "api-reference/python/PyMatchStatement", - "api-reference/python/PyNamedType", - "api-reference/python/PyParameter", - "api-reference/python/PyPassStatement", - "api-reference/python/PyReturnTypePlaceholder", - "api-reference/python/PyString", - "api-reference/python/PySymbol", - "api-reference/python/PyTryCatchStatement", - "api-reference/python/PyUnionType", - "api-reference/python/PyWhileStatement" - ] - }, - { - "group": "Typescript", - "icon": 
"js", - "pages": [ - "api-reference/typescript/JSXElement", - "api-reference/typescript/JSXExpression", - "api-reference/typescript/JSXProp", - "api-reference/typescript/TSArrayType", - "api-reference/typescript/TSAssignment", - "api-reference/typescript/TSAssignmentStatement", - "api-reference/typescript/TSAttribute", - "api-reference/typescript/TSBlockStatement", - "api-reference/typescript/TSCatchStatement", - "api-reference/typescript/TSChainedAttribute", - "api-reference/typescript/TSClass", - "api-reference/typescript/TSCodeBlock", - "api-reference/typescript/TSComment", - "api-reference/typescript/TSCommentGroup", - "api-reference/typescript/TSCommentType", - "api-reference/typescript/TSConditionalType", - "api-reference/typescript/TSConfig", - "api-reference/typescript/TSDecorator", - "api-reference/typescript/TSDict", - "api-reference/typescript/TSEnum", - "api-reference/typescript/TSExport", - "api-reference/typescript/TSExpressionType", - "api-reference/typescript/TSFile", - "api-reference/typescript/TSForLoopStatement", - "api-reference/typescript/TSFunction", - "api-reference/typescript/TSFunctionType", - "api-reference/typescript/TSGenericType", - "api-reference/typescript/TSHasBlock", - "api-reference/typescript/TSIfBlockStatement", - "api-reference/typescript/TSImport", - "api-reference/typescript/TSImportStatement", - "api-reference/typescript/TSInterface", - "api-reference/typescript/TSLabeledStatement", - "api-reference/typescript/TSLookupType", - "api-reference/typescript/TSNamedType", - "api-reference/typescript/TSNamespace", - "api-reference/typescript/TSObjectType", - "api-reference/typescript/TSPair", - "api-reference/typescript/TSParameter", - "api-reference/typescript/TSQueryType", - "api-reference/typescript/TSReadonlyType", - "api-reference/typescript/TSReturnTypePlaceholder", - "api-reference/typescript/TSString", - "api-reference/typescript/TSSwitchCase", - "api-reference/typescript/TSSwitchStatement", - 
"api-reference/typescript/TSSymbol", - "api-reference/typescript/TSTernaryExpression", - "api-reference/typescript/TSTryCatchStatement", - "api-reference/typescript/TSTypeAlias", - "api-reference/typescript/TSUndefinedType", - "api-reference/typescript/TSUnionType", - "api-reference/typescript/TSWhileStatement" - ] - } - ] - } - ], - "footerSocials": { - "x": "https://x.com/codegen", - "linkedin": "https://linkedin.com/company/codegen-dot-com" - } -} \ No newline at end of file + "$schema": "https://mintlify.com/schema.json", + "name": "Codegen", + "logo": { + "dark": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45a3e32761c42b324b_Codegen_Logomark_Dark.svg", + "light": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45bf55446746125835_Codegen_Logomark_Light.svg" + }, + "modeToggle": { + "default": "dark" + }, + "metadata": { + "og:site_name": "Codegen", + "og:title": "Codegen - Manipulate Code at Scale", + "og:description": "A scriptable interface to a powerful, multi-lingual language server built on top of Tree-sitter.", + "og:url": "https://docs.codegen.com", + "og:locale": "en_US", + "og:logo": "https://i.imgur.com/f4OVOqI.png", + "article:publisher": "Codegen, Inc.", + "twitter:site": "@codegen" + }, + "favicon": "/favicon.svg", + "colors": { + "primary": "#a277ff", + "light": "#a277ff", + "dark": "#a277ff", + "anchors": { + "from": "#61ffca", + "to": "#61ffca" + } + }, + "theme": "prism", + "background": { + "style": "gradient" + }, + "analytics": { + "posthog": { + "apiKey": "phc_GLxaINoQJnuyCyxDmTciQqzdKBYFVDkY7bRBO4bDdso" + } + }, + "feedback": { + "thumbsRating": true + }, + "topbarCtaButton": { + "name": "GitHub", + "url": "https://github.com/codegen-sh/codegen-sdk" + }, + "tabs": [ + { + "name": "API Reference", + "url": "/api-reference" + }, + { + "name": "CLI", + "url": "/cli" + }, + { + "name": "Blog", + "url": "/blog" + }, + { + "name": "Changelog", + "url": "/changelog" + } + ], + "navigation": [ + { + 
"group": "Introduction", + "pages": [ + "introduction/overview", + "introduction/getting-started", + "introduction/installation", + "introduction/ide-usage", + "introduction/work-with-ai", + "introduction/how-it-works", + "introduction/guiding-principles", + "introduction/community", + "introduction/about", + "introduction/faq" + ] + }, + { + "group": "Tutorials", + "pages": [ + "tutorials/at-a-glance", + "tutorials/build-code-agent", + "tutorials/slack-bot", + "tutorials/github-review-bot", + "tutorials/deep-code-research", + "tutorials/training-data", + "tutorials/codebase-visualization", + "tutorials/migrating-apis", + "tutorials/organize-your-codebase", + "tutorials/promise-to-async-await", + "tutorials/modularity", + "tutorials/manage-feature-flags", + "tutorials/deleting-dead-code", + "tutorials/increase-type-coverage", + "tutorials/managing-typescript-exports", + "tutorials/converting-default-exports", + "tutorials/creating-documentation", + "tutorials/react-modernization", + "tutorials/unittest-to-pytest", + "tutorials/sqlalchemy-1.6-to-2.0", + "tutorials/fixing-import-loops-in-pytorch", + "tutorials/python2-to-python3", + "tutorials/flask-to-fastapi", + "tutorials/build-mcp", + "tutorials/neo4j-graph", + "tutorials/attributions" + ] + }, + { + "group": "Building with Codegen", + "pages": [ + "building-with-codegen/at-a-glance", + "building-with-codegen/parsing-codebases", + "building-with-codegen/reusable-codemods", + "building-with-codegen/dot-codegen", + "building-with-codegen/function-decorator", + "building-with-codegen/language-support", + "building-with-codegen/commit-and-reset", + "building-with-codegen/git-operations", + "building-with-codegen/files-and-directories", + "building-with-codegen/the-editable-api", + "building-with-codegen/symbol-api", + "building-with-codegen/class-api", + "building-with-codegen/imports", + "building-with-codegen/exports", + "building-with-codegen/inheritable-behaviors", + 
"building-with-codegen/statements-and-code-blocks", + "building-with-codegen/dependencies-and-usages", + "building-with-codegen/function-calls-and-callsites", + "building-with-codegen/variable-assignments", + "building-with-codegen/local-variables", + "building-with-codegen/comments-and-docstrings", + "building-with-codegen/external-modules", + "building-with-codegen/type-annotations", + "building-with-codegen/moving-symbols", + "building-with-codegen/collections", + "building-with-codegen/traversing-the-call-graph", + "building-with-codegen/react-and-jsx", + "building-with-codegen/codebase-visualization", + "building-with-codegen/flagging-symbols", + "building-with-codegen/calling-out-to-llms", + "building-with-codegen/semantic-code-search", + "building-with-codegen/reducing-conditions" + ] + }, + { + "group": "CLI", + "pages": [ + "cli/about", + "cli/init", + "cli/notebook", + "cli/create", + "cli/run", + "cli/reset", + "cli/expert" + ] + }, + { + "group": "Changelog", + "pages": ["changelog/changelog"] + }, + { + "group": "Blog", + "pages": [ + "blog/posts", + "blog/act-via-code", + "blog/promise-to-async-await-twilio", + "blog/fixing-import-loops" + ] + }, + { + "group": "API Reference", + "pages": [ + "api-reference/index", + { + "group": "Core", + "icon": "code", + "pages": [ + "api-reference/core/Argument", + "api-reference/core/Assignment", + "api-reference/core/AssignmentStatement", + "api-reference/core/Attribute", + "api-reference/core/AwaitExpression", + "api-reference/core/BinaryExpression", + "api-reference/core/BlockStatement", + "api-reference/core/Boolean", + "api-reference/core/Callable", + "api-reference/core/CatchStatement", + "api-reference/core/ChainedAttribute", + "api-reference/core/Class", + "api-reference/core/CodeBlock", + "api-reference/core/CodeOwner", + "api-reference/core/Codebase", + "api-reference/core/Comment", + "api-reference/core/CommentGroup", + "api-reference/core/ComparisonExpression", + "api-reference/core/Decorator", + 
"api-reference/core/Dict", + "api-reference/core/Directory", + "api-reference/core/Editable", + "api-reference/core/Export", + "api-reference/core/ExportStatement", + "api-reference/core/Exportable", + "api-reference/core/Expression", + "api-reference/core/ExpressionGroup", + "api-reference/core/ExpressionStatement", + "api-reference/core/ExternalModule", + "api-reference/core/File", + "api-reference/core/FlagKwargs", + "api-reference/core/ForLoopStatement", + "api-reference/core/Function", + "api-reference/core/FunctionCall", + "api-reference/core/GenericType", + "api-reference/core/HasBlock", + "api-reference/core/HasName", + "api-reference/core/HasValue", + "api-reference/core/IfBlockStatement", + "api-reference/core/Import", + "api-reference/core/ImportStatement", + "api-reference/core/ImportType", + "api-reference/core/Importable", + "api-reference/core/Interface", + "api-reference/core/List", + "api-reference/core/MessageType", + "api-reference/core/MultiExpression", + "api-reference/core/MultiLineCollection", + "api-reference/core/Name", + "api-reference/core/NamedType", + "api-reference/core/NoneType", + "api-reference/core/Number", + "api-reference/core/Pair", + "api-reference/core/Parameter", + "api-reference/core/ParenthesizedExpression", + "api-reference/core/Placeholder", + "api-reference/core/PlaceholderType", + "api-reference/core/RaiseStatement", + "api-reference/core/ReturnStatement", + "api-reference/core/SourceFile", + "api-reference/core/Span", + "api-reference/core/Statement", + "api-reference/core/StatementType", + "api-reference/core/String", + "api-reference/core/StubPlaceholder", + "api-reference/core/SubscriptExpression", + "api-reference/core/SwitchCase", + "api-reference/core/SwitchStatement", + "api-reference/core/Symbol", + "api-reference/core/SymbolGroup", + "api-reference/core/SymbolStatement", + "api-reference/core/TernaryExpression", + "api-reference/core/TryCatchStatement", + "api-reference/core/Tuple", + 
"api-reference/core/TupleType", + "api-reference/core/Type", + "api-reference/core/TypeAlias", + "api-reference/core/TypePlaceholder", + "api-reference/core/Typeable", + "api-reference/core/UnaryExpression", + "api-reference/core/UnionType", + "api-reference/core/Unpack", + "api-reference/core/Unwrappable", + "api-reference/core/Usable", + "api-reference/core/Usage", + "api-reference/core/UsageKind", + "api-reference/core/UsageType", + "api-reference/core/Value", + "api-reference/core/WhileStatement", + "api-reference/core/WithStatement" + ] + }, + { + "group": "Python", + "icon": "python", + "pages": [ + "api-reference/python/PyAssignment", + "api-reference/python/PyAssignmentStatement", + "api-reference/python/PyAttribute", + "api-reference/python/PyBlockStatement", + "api-reference/python/PyBreakStatement", + "api-reference/python/PyCatchStatement", + "api-reference/python/PyChainedAttribute", + "api-reference/python/PyClass", + "api-reference/python/PyCodeBlock", + "api-reference/python/PyComment", + "api-reference/python/PyCommentGroup", + "api-reference/python/PyCommentType", + "api-reference/python/PyConditionalExpression", + "api-reference/python/PyDecorator", + "api-reference/python/PyFile", + "api-reference/python/PyForLoopStatement", + "api-reference/python/PyFunction", + "api-reference/python/PyGenericType", + "api-reference/python/PyHasBlock", + "api-reference/python/PyIfBlockStatement", + "api-reference/python/PyImport", + "api-reference/python/PyImportStatement", + "api-reference/python/PyMatchCase", + "api-reference/python/PyMatchStatement", + "api-reference/python/PyNamedType", + "api-reference/python/PyParameter", + "api-reference/python/PyPassStatement", + "api-reference/python/PyReturnTypePlaceholder", + "api-reference/python/PyString", + "api-reference/python/PySymbol", + "api-reference/python/PyTryCatchStatement", + "api-reference/python/PyUnionType", + "api-reference/python/PyWhileStatement" + ] + }, + { + "group": "Typescript", + "icon": 
"js", + "pages": [ + "api-reference/typescript/JSXElement", + "api-reference/typescript/JSXExpression", + "api-reference/typescript/JSXProp", + "api-reference/typescript/TSArrayType", + "api-reference/typescript/TSAssignment", + "api-reference/typescript/TSAssignmentStatement", + "api-reference/typescript/TSAttribute", + "api-reference/typescript/TSBlockStatement", + "api-reference/typescript/TSCatchStatement", + "api-reference/typescript/TSChainedAttribute", + "api-reference/typescript/TSClass", + "api-reference/typescript/TSCodeBlock", + "api-reference/typescript/TSComment", + "api-reference/typescript/TSCommentGroup", + "api-reference/typescript/TSCommentType", + "api-reference/typescript/TSConditionalType", + "api-reference/typescript/TSConfig", + "api-reference/typescript/TSDecorator", + "api-reference/typescript/TSDict", + "api-reference/typescript/TSEnum", + "api-reference/typescript/TSExport", + "api-reference/typescript/TSExpressionType", + "api-reference/typescript/TSFile", + "api-reference/typescript/TSForLoopStatement", + "api-reference/typescript/TSFunction", + "api-reference/typescript/TSFunctionType", + "api-reference/typescript/TSGenericType", + "api-reference/typescript/TSHasBlock", + "api-reference/typescript/TSIfBlockStatement", + "api-reference/typescript/TSImport", + "api-reference/typescript/TSImportStatement", + "api-reference/typescript/TSInterface", + "api-reference/typescript/TSLabeledStatement", + "api-reference/typescript/TSLookupType", + "api-reference/typescript/TSNamedType", + "api-reference/typescript/TSNamespace", + "api-reference/typescript/TSObjectType", + "api-reference/typescript/TSPair", + "api-reference/typescript/TSParameter", + "api-reference/typescript/TSQueryType", + "api-reference/typescript/TSReadonlyType", + "api-reference/typescript/TSReturnTypePlaceholder", + "api-reference/typescript/TSString", + "api-reference/typescript/TSSwitchCase", + "api-reference/typescript/TSSwitchStatement", + 
"api-reference/typescript/TSSymbol", + "api-reference/typescript/TSTernaryExpression", + "api-reference/typescript/TSTryCatchStatement", + "api-reference/typescript/TSTypeAlias", + "api-reference/typescript/TSUndefinedType", + "api-reference/typescript/TSUnionType", + "api-reference/typescript/TSWhileStatement" + ] + } + ] + } + ], + "footerSocials": { + "x": "https://x.com/codegen", + "linkedin": "https://linkedin.com/company/codegen-dot-com" + } +} From 56fbca6491cc80c6627f9340a3d7f37dc2b53f40 Mon Sep 17 00:00:00 2001 From: tkucar Date: Fri, 7 Mar 2025 20:10:25 +0100 Subject: [PATCH 5/7] warning: this drops changes that are not commited! to be addressed in next revision --- .../extensions/attribution/git_history.py | 140 +++++++++++++++--- 1 file changed, 119 insertions(+), 21 deletions(-) diff --git a/src/codegen/extensions/attribution/git_history.py b/src/codegen/extensions/attribution/git_history.py index 42ee0c40a..c66fa5f07 100644 --- a/src/codegen/extensions/attribution/git_history.py +++ b/src/codegen/extensions/attribution/git_history.py @@ -1,11 +1,15 @@ import time -from collections import defaultdict +from collections import defaultdict, deque from datetime import datetime from typing import Optional import pygit2 +from intervaltree import IntervalTree +from pygit2 import Commit, Patch +from pygit2.enums import CheckoutStrategy, DeltaStatus, SortMode from codegen.sdk.core.codebase import Codebase +from codegen.sdk.core.file import SourceFile from codegen.sdk.core.symbol import Symbol @@ -23,18 +27,23 @@ def __init__(self, codebase: Codebase, ai_authors: Optional[list[str]] = None): self.codebase = codebase self.repo_path = codebase.ctx.projects[0].repo_operator.repo_path self.repo = pygit2.Repository(self.repo_path) + self.org_branch_reference = self.repo.head # Default AI authors if none provided self.ai_authors = ai_authors or ["devin[bot]", "codegen[bot]"] # Cache structures self._file_history = {} # file path -> list of commit info - 
self._symbol_history = {} # symbol id -> list of commit info + self._symbol_history:defaultdict[str,list] = defaultdict(list) # symbol id -> list of commit info self._author_contributions = defaultdict(list) # author -> list of commit info # Track if history has been built self._history_built = False + self._file_symbol_location_state:dict[str,IntervalTree] = {} + + self._commits:deque[Commit] + def build_history(self, max_commits: Optional[int] = None) -> None: """Build the git history for the codebase. @@ -57,12 +66,13 @@ def build_history(self, max_commits: Optional[int] = None) -> None: commit_count = 0 author_set = set() + self._commits=deque() try: - for commit in self.repo.walk(self.repo.head.target, pygit2.GIT_SORT_TIME): + for commit in self.repo.walk(self.repo.head.target, SortMode.TIME): # Track unique authors author_id = f"{commit.author.name} <{commit.author.email}>" author_set.add(author_id) - + self._commits.append(commit) # Process each diff in the commit if len(commit.parents) > 0: try: @@ -144,6 +154,35 @@ def _process_commit(self, commit, diff) -> None: file_commit["file_path"] = file_path self._file_history[file_path].append(file_commit) + + def _process_symbol_location_state(self, filepaths:list[str]): + for filepath in filepaths: + file = self.codebase.get_file(filepath) + filetree = IntervalTree() + try: + for symbol in file.symbols: + symbol:Symbol + start_line=symbol.range.start_point.row+1 # 1 Indexing + end_line=symbol.range.end_point.row+2 # Intervaltree is end non-inclusive + filetree.addi(start_line,end_line,symbol) + except Exception as e: + pass + self._file_symbol_location_state[filepath] = filetree + + def _get_symbols_affected_by_patch(self,patch:Patch,filepath): + if filepath not in self._file_symbol_location_state: + return [] + symbols_affected=set() + for hunk in patch.hunks: + start = hunk.new_start + end = start+hunk.new_lines # Intervaltree is end non-inclusive + for interval in 
self._file_symbol_location_state[filepath].overlap(start,end): + symbols_affected.add(interval[2]) + + return symbols_affected + + + def _is_tracked_file(self, file_path: str) -> bool: """Check if a file should be tracked based on extension.""" # Get file extensions from the codebase @@ -160,31 +199,90 @@ def _ensure_history_built(self) -> None: if not self._history_built: self.build_history() - def map_symbols_to_history(self) -> None: - """Map symbols in the codebase to their git history.""" + def map_symbols_to_history(self,force=False) -> None: + """Map symbols in the codebase to their git history. force ensures a rerun even if data is already found!""" self._ensure_history_built() + if self._symbol_history: + print("Already built, run with force if you want to rerun anyway!") + return print("Mapping symbols to git history...") start_time = time.time() - # For each symbol, find commits that modified its file - for symbol in self.codebase.symbols: - if not hasattr(symbol, "filepath") or not symbol.filepath: - continue - - symbol_id = f"{symbol.filepath}:{symbol.name}" - self._symbol_history[symbol_id] = [] - - # Get file history - file_history = self._file_history.get(symbol.filepath, []) + print("Turning off graph mapping!") - # For now, just associate all file changes with the symbol - # A more sophisticated approach would use line ranges - for commit in file_history: - self._symbol_history[symbol_id].append(commit) + print("Generating initial symbol state...") + filepaths = [file.filepath for file in self.codebase.files] + self._process_symbol_location_state(filepaths) elapsed = time.time() - start_time - print(f"Finished mapping symbols in {elapsed:.2f} seconds.") + print(f"Finished initial symbol state generation in {elapsed:.2f} seconds.") + symbol_tracking_checkpoint=time.time() + try: + print("Starting symbol tracking procedure....") + for commit in self._commits: + author_name = commit.author.name + author_email = commit.author.email + timestamp = 
commit.author.time + commit_id = str(commit.id) + + commit_info = { + "author": author_name, + "email": author_email, + "timestamp": timestamp, + "commit_id": commit_id, + "message": commit.message.strip(), + } + commit_previous = commit.parents[0] if commit.parents else None + if not commit_previous: + #If Last commit + empty_tree_old = self.repo.TreeBuilder().write() + empty_tree=self.repo.get(empty_tree_old) + diff = self.repo.diff(empty_tree,commit.tree) + else: + diff = self.repo.diff(commit_previous, commit,context_lines=0) #We don't need context lines + + if isinstance(diff,Patch): + diff=[diff] + sync_past_filepaths=[] #Files to sync in the past commit + for patch in diff: + filepath=patch.delta.new_file.path + if not self._is_tracked_file(filepath): + continue #Ignore files we don't track + if not patch.delta.status==DeltaStatus.ADDED: #Reversed since we're going backwards, if it doesn't exist in the past commits don't sync! + sync_past_filepaths.append(filepath) + symbols_affected = self._get_symbols_affected_by_patch(patch,filepath) + for symbol in symbols_affected: + symbol_id = f"{symbol.filepath}:{symbol.name}" #For future stuff might want to do this more neatly and allow for future dead symbols/renames + self._symbol_history[symbol_id].append(commit_info) + + if commit_previous: + #If not last commit + self.repo.checkout_tree(commit_previous,strategy=CheckoutStrategy.FORCE) + self.repo.set_head(commit_previous.id) + files = [self.codebase.get_file(fp) for fp in sync_past_filepaths] + exclude_state_files=[] + for file in files: + if not isinstance(file,SourceFile): + #What kind of pyfiles are not source files? To investigate! 
+ exclude_state_files.append(file.filepath) + continue + file.sync_with_file_content() + self._process_symbol_location_state([fp for fp in sync_past_filepaths if fp not in exclude_state_files]) + + finally: + print("Finished, restoring git repo state...") + self.repo.checkout(self.org_branch_reference,strategy=CheckoutStrategy.FORCE) + + print(f"Restored, newest commit id in repo is {self.repo.revparse_single(self.org_branch_reference.name).id}") + + + + end_time = time.time() + elapsed_total = end_time - start_time + elapsed_symbol_tracking = end_time-symbol_tracking_checkpoint + print(f"Finished symbol tracking in {elapsed_symbol_tracking:.2f} seconds.") + print(f"Finished mapping symbols in {elapsed_total:.2f} seconds.") def get_symbol_history(self, symbol: Symbol) -> list[dict]: """Get the edit history for a symbol. From 7d6ecd14318a2690de65d5e8ea84d467ead9028a Mon Sep 17 00:00:00 2001 From: tomcodegen Date: Mon, 10 Mar 2025 11:03:06 -0700 Subject: [PATCH 6/7] stash before run --- .../extensions/attribution/git_history.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/src/codegen/extensions/attribution/git_history.py b/src/codegen/extensions/attribution/git_history.py index c66fa5f07..06450c32f 100644 --- a/src/codegen/extensions/attribution/git_history.py +++ b/src/codegen/extensions/attribution/git_history.py @@ -28,7 +28,6 @@ def __init__(self, codebase: Codebase, ai_authors: Optional[list[str]] = None): self.repo_path = codebase.ctx.projects[0].repo_operator.repo_path self.repo = pygit2.Repository(self.repo_path) self.org_branch_reference = self.repo.head - # Default AI authors if none provided self.ai_authors = ai_authors or ["devin[bot]", "codegen[bot]"] @@ -209,7 +208,19 @@ def map_symbols_to_history(self,force=False) -> None: print("Mapping symbols to git history...") start_time = time.time() - print("Turning off graph mapping!") + + + print("Stashing any working directory changes...") + stash_msg = f"Codegen 
Attribution Stash @ {datetime.now().timestamp()}" + stash_id=None + try: + stash_id = self.repo.stash(self.repo.default_signature,stash_msg,include_untracked=True) + print("Stashed!") + except KeyError as e: + print("Nothing to stash, proceeding.....") + except Exception as e: + print("Error encountered attempting to stash the current working state, stopping to preserve work, please manually clean the working directory and try again!") + raise(e) print("Generating initial symbol state...") filepaths = [file.filepath for file in self.codebase.files] @@ -274,7 +285,27 @@ def map_symbols_to_history(self,force=False) -> None: print("Finished, restoring git repo state...") self.repo.checkout(self.org_branch_reference,strategy=CheckoutStrategy.FORCE) - print(f"Restored, newest commit id in repo is {self.repo.revparse_single(self.org_branch_reference.name).id}") + print(f"Restored to latest commit, newest commit id in repo is {self.repo.revparse_single(self.org_branch_reference.name).id}") + + if stash_id: + #Restoring Working Directory + print("Restoring working directory changes...") + found_stash=None + for idx,stash in enumerate(self.repo.listall_stashes()): + if stash_msg in stash.message: + found_stash=idx + break + if found_stash==0: + print("Applying stash..") + self.repo.stash_apply(0,reinstate_index=True) + print("Applied Stash") + self.repo.stash_drop(0) + print("Stash Removed!") + else: + print("Another stash occured in the meantime,please handle stash resotration manually") + print(f"Codebase stash index:{found_stash}") + print(f"Codebase stash msg:{stash_msg}") + print(f"Codebase stash oid:{stash_id}") From c5ad1cd824d2aa80236bd7510a04cf1a4597512c Mon Sep 17 00:00:00 2001 From: vishalshenoy <34020235+vishalshenoy@users.noreply.github.com> Date: Tue, 11 Mar 2025 20:53:11 +0000 Subject: [PATCH 7/7] Automated pre-commit update --- docs/mint.json | 756 +++++++++--------- .../extensions/attribution/git_history.py | 93 +-- 2 files changed, 420 insertions(+), 429 
deletions(-) diff --git a/docs/mint.json b/docs/mint.json index ab4b0c49a..cec15a787 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -1,381 +1,379 @@ { - "$schema": "https://mintlify.com/schema.json", - "name": "Codegen", - "logo": { - "dark": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45a3e32761c42b324b_Codegen_Logomark_Dark.svg", - "light": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45bf55446746125835_Codegen_Logomark_Light.svg" - }, - "modeToggle": { - "default": "dark" - }, - "metadata": { - "og:site_name": "Codegen", - "og:title": "Codegen - Manipulate Code at Scale", - "og:description": "A scriptable interface to a powerful, multi-lingual language server built on top of Tree-sitter.", - "og:url": "https://docs.codegen.com", - "og:locale": "en_US", - "og:logo": "https://i.imgur.com/f4OVOqI.png", - "article:publisher": "Codegen, Inc.", - "twitter:site": "@codegen" - }, - "favicon": "/favicon.svg", - "colors": { - "primary": "#a277ff", - "light": "#a277ff", - "dark": "#a277ff", - "anchors": { - "from": "#61ffca", - "to": "#61ffca" - } - }, - "theme": "prism", - "background": { - "style": "gradient" - }, - "analytics": { - "posthog": { - "apiKey": "phc_GLxaINoQJnuyCyxDmTciQqzdKBYFVDkY7bRBO4bDdso" - } - }, - "feedback": { - "thumbsRating": true - }, - "topbarCtaButton": { - "name": "GitHub", - "url": "https://github.com/codegen-sh/codegen-sdk" - }, - "tabs": [ - { - "name": "API Reference", - "url": "/api-reference" - }, - { - "name": "CLI", - "url": "/cli" - }, - { - "name": "Blog", - "url": "/blog" - }, - { - "name": "Changelog", - "url": "/changelog" - } - ], - "navigation": [ - { - "group": "Introduction", - "pages": [ - "introduction/overview", - "introduction/getting-started", - "introduction/installation", - "introduction/ide-usage", - "introduction/work-with-ai", - "introduction/how-it-works", - "introduction/advanced-settings", - "introduction/guiding-principles", - "introduction/community", - 
"introduction/about", - "introduction/faq" - ] - }, - { - "group": "Tutorials", - "pages": [ - "tutorials/at-a-glance", - "tutorials/build-code-agent", - "tutorials/slack-bot", - "tutorials/github-review-bot", - "tutorials/deep-code-research", - "tutorials/training-data", - "tutorials/codebase-visualization", - "tutorials/migrating-apis", - "tutorials/organize-your-codebase", - "tutorials/promise-to-async-await", - "tutorials/modularity", - "tutorials/manage-feature-flags", - "tutorials/deleting-dead-code", - "tutorials/increase-type-coverage", - "tutorials/managing-typescript-exports", - "tutorials/converting-default-exports", - "tutorials/creating-documentation", - "tutorials/react-modernization", - "tutorials/unittest-to-pytest", - "tutorials/sqlalchemy-1.6-to-2.0", - "tutorials/fixing-import-loops-in-pytorch", - "tutorials/python2-to-python3", - "tutorials/flask-to-fastapi", - "tutorials/build-mcp", - "tutorials/neo4j-graph" - ] - }, - { - "group": "Building with Codegen", - "pages": [ - "building-with-codegen/at-a-glance", - "building-with-codegen/parsing-codebases", - "building-with-codegen/reusable-codemods", - "building-with-codegen/dot-codegen", - "building-with-codegen/function-decorator", - "building-with-codegen/language-support", - "building-with-codegen/commit-and-reset", - "building-with-codegen/git-operations", - "building-with-codegen/files-and-directories", - "building-with-codegen/the-editable-api", - "building-with-codegen/symbol-api", - "building-with-codegen/class-api", - "building-with-codegen/imports", - "building-with-codegen/exports", - "building-with-codegen/inheritable-behaviors", - "building-with-codegen/statements-and-code-blocks", - "building-with-codegen/dependencies-and-usages", - "building-with-codegen/function-calls-and-callsites", - "building-with-codegen/variable-assignments", - "building-with-codegen/local-variables", - "building-with-codegen/comments-and-docstrings", - "building-with-codegen/external-modules", - 
"building-with-codegen/type-annotations", - "building-with-codegen/moving-symbols", - "building-with-codegen/collections", - "building-with-codegen/traversing-the-call-graph", - "building-with-codegen/react-and-jsx", - "building-with-codegen/codebase-visualization", - "building-with-codegen/flagging-symbols", - "building-with-codegen/calling-out-to-llms", - "building-with-codegen/semantic-code-search", - "building-with-codegen/reducing-conditions" - ] - }, - { - "group": "CLI", - "pages": [ - "cli/about", - "cli/init", - "cli/notebook", - "cli/create", - "cli/run", - "cli/reset", - "cli/expert" - ] - }, - { - "group": "Changelog", - "pages": [ - "changelog/changelog" - ] - }, - { - "group": "Blog", - "pages": [ - "blog/posts", - "blog/devin", - "blog/act-via-code", - "blog/promise-to-async-await-twilio", - "blog/fixing-import-loops" - ] - }, - { - "group": "API Reference", - "pages": [ - "api-reference/index", - { - "group": "Core", - "icon": "code", - "pages": [ - "api-reference/core/Argument", - "api-reference/core/Assignment", - "api-reference/core/AssignmentStatement", - "api-reference/core/Attribute", - "api-reference/core/AwaitExpression", - "api-reference/core/BinaryExpression", - "api-reference/core/BlockStatement", - "api-reference/core/Boolean", - "api-reference/core/Callable", - "api-reference/core/CatchStatement", - "api-reference/core/ChainedAttribute", - "api-reference/core/Class", - "api-reference/core/CodeBlock", - "api-reference/core/CodeOwner", - "api-reference/core/Codebase", - "api-reference/core/Comment", - "api-reference/core/CommentGroup", - "api-reference/core/ComparisonExpression", - "api-reference/core/Decorator", - "api-reference/core/Dict", - "api-reference/core/Directory", - "api-reference/core/Editable", - "api-reference/core/Export", - "api-reference/core/ExportStatement", - "api-reference/core/Exportable", - "api-reference/core/Expression", - "api-reference/core/ExpressionGroup", - "api-reference/core/ExpressionStatement", - 
"api-reference/core/ExternalModule", - "api-reference/core/File", - "api-reference/core/FlagKwargs", - "api-reference/core/ForLoopStatement", - "api-reference/core/Function", - "api-reference/core/FunctionCall", - "api-reference/core/GenericType", - "api-reference/core/HasBlock", - "api-reference/core/HasName", - "api-reference/core/HasValue", - "api-reference/core/IfBlockStatement", - "api-reference/core/Import", - "api-reference/core/ImportStatement", - "api-reference/core/ImportType", - "api-reference/core/Importable", - "api-reference/core/Interface", - "api-reference/core/List", - "api-reference/core/MessageType", - "api-reference/core/MultiExpression", - "api-reference/core/MultiLineCollection", - "api-reference/core/Name", - "api-reference/core/NamedType", - "api-reference/core/NoneType", - "api-reference/core/Number", - "api-reference/core/Pair", - "api-reference/core/Parameter", - "api-reference/core/ParenthesizedExpression", - "api-reference/core/Placeholder", - "api-reference/core/PlaceholderType", - "api-reference/core/RaiseStatement", - "api-reference/core/ReturnStatement", - "api-reference/core/SourceFile", - "api-reference/core/Span", - "api-reference/core/Statement", - "api-reference/core/StatementType", - "api-reference/core/String", - "api-reference/core/StubPlaceholder", - "api-reference/core/SubscriptExpression", - "api-reference/core/SwitchCase", - "api-reference/core/SwitchStatement", - "api-reference/core/Symbol", - "api-reference/core/SymbolGroup", - "api-reference/core/SymbolStatement", - "api-reference/core/TernaryExpression", - "api-reference/core/TryCatchStatement", - "api-reference/core/Tuple", - "api-reference/core/TupleType", - "api-reference/core/Type", - "api-reference/core/TypeAlias", - "api-reference/core/TypePlaceholder", - "api-reference/core/Typeable", - "api-reference/core/UnaryExpression", - "api-reference/core/UnionType", - "api-reference/core/Unpack", - "api-reference/core/Unwrappable", - "api-reference/core/Usable", - 
"api-reference/core/Usage", - "api-reference/core/UsageKind", - "api-reference/core/UsageType", - "api-reference/core/Value", - "api-reference/core/WhileStatement", - "api-reference/core/WithStatement" - ] - }, - { - "group": "Python", - "icon": "python", - "pages": [ - "api-reference/python/PyAssignment", - "api-reference/python/PyAssignmentStatement", - "api-reference/python/PyAttribute", - "api-reference/python/PyBlockStatement", - "api-reference/python/PyBreakStatement", - "api-reference/python/PyCatchStatement", - "api-reference/python/PyChainedAttribute", - "api-reference/python/PyClass", - "api-reference/python/PyCodeBlock", - "api-reference/python/PyComment", - "api-reference/python/PyCommentGroup", - "api-reference/python/PyCommentType", - "api-reference/python/PyConditionalExpression", - "api-reference/python/PyDecorator", - "api-reference/python/PyFile", - "api-reference/python/PyForLoopStatement", - "api-reference/python/PyFunction", - "api-reference/python/PyGenericType", - "api-reference/python/PyHasBlock", - "api-reference/python/PyIfBlockStatement", - "api-reference/python/PyImport", - "api-reference/python/PyImportStatement", - "api-reference/python/PyMatchCase", - "api-reference/python/PyMatchStatement", - "api-reference/python/PyNamedType", - "api-reference/python/PyParameter", - "api-reference/python/PyPassStatement", - "api-reference/python/PyReturnTypePlaceholder", - "api-reference/python/PyString", - "api-reference/python/PySymbol", - "api-reference/python/PyTryCatchStatement", - "api-reference/python/PyUnionType", - "api-reference/python/PyWhileStatement" - ] - }, - { - "group": "Typescript", - "icon": "js", - "pages": [ - "api-reference/typescript/JSXElement", - "api-reference/typescript/JSXExpression", - "api-reference/typescript/JSXProp", - "api-reference/typescript/TSArrayType", - "api-reference/typescript/TSAssignment", - "api-reference/typescript/TSAssignmentStatement", - "api-reference/typescript/TSAttribute", - 
"api-reference/typescript/TSBlockStatement", - "api-reference/typescript/TSCatchStatement", - "api-reference/typescript/TSChainedAttribute", - "api-reference/typescript/TSClass", - "api-reference/typescript/TSCodeBlock", - "api-reference/typescript/TSComment", - "api-reference/typescript/TSCommentGroup", - "api-reference/typescript/TSCommentType", - "api-reference/typescript/TSConditionalType", - "api-reference/typescript/TSConfig", - "api-reference/typescript/TSDecorator", - "api-reference/typescript/TSDict", - "api-reference/typescript/TSEnum", - "api-reference/typescript/TSExport", - "api-reference/typescript/TSExpressionType", - "api-reference/typescript/TSFile", - "api-reference/typescript/TSForLoopStatement", - "api-reference/typescript/TSFunction", - "api-reference/typescript/TSFunctionType", - "api-reference/typescript/TSGenericType", - "api-reference/typescript/TSHasBlock", - "api-reference/typescript/TSIfBlockStatement", - "api-reference/typescript/TSImport", - "api-reference/typescript/TSImportStatement", - "api-reference/typescript/TSInterface", - "api-reference/typescript/TSLabeledStatement", - "api-reference/typescript/TSLookupType", - "api-reference/typescript/TSNamedType", - "api-reference/typescript/TSNamespace", - "api-reference/typescript/TSObjectType", - "api-reference/typescript/TSPair", - "api-reference/typescript/TSParameter", - "api-reference/typescript/TSQueryType", - "api-reference/typescript/TSReadonlyType", - "api-reference/typescript/TSReturnTypePlaceholder", - "api-reference/typescript/TSString", - "api-reference/typescript/TSSwitchCase", - "api-reference/typescript/TSSwitchStatement", - "api-reference/typescript/TSSymbol", - "api-reference/typescript/TSTernaryExpression", - "api-reference/typescript/TSTryCatchStatement", - "api-reference/typescript/TSTypeAlias", - "api-reference/typescript/TSUndefinedType", - "api-reference/typescript/TSUnionType", - "api-reference/typescript/TSWhileStatement" - ] - } - ] - } - ], - "footerSocials": { 
- "x": "https://x.com/codegen", - "linkedin": "https://linkedin.com/company/codegen-dot-com" - } + "$schema": "https://mintlify.com/schema.json", + "name": "Codegen", + "logo": { + "dark": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45a3e32761c42b324b_Codegen_Logomark_Dark.svg", + "light": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45bf55446746125835_Codegen_Logomark_Light.svg" + }, + "modeToggle": { + "default": "dark" + }, + "metadata": { + "og:site_name": "Codegen", + "og:title": "Codegen - Manipulate Code at Scale", + "og:description": "A scriptable interface to a powerful, multi-lingual language server built on top of Tree-sitter.", + "og:url": "https://docs.codegen.com", + "og:locale": "en_US", + "og:logo": "https://i.imgur.com/f4OVOqI.png", + "article:publisher": "Codegen, Inc.", + "twitter:site": "@codegen" + }, + "favicon": "/favicon.svg", + "colors": { + "primary": "#a277ff", + "light": "#a277ff", + "dark": "#a277ff", + "anchors": { + "from": "#61ffca", + "to": "#61ffca" + } + }, + "theme": "prism", + "background": { + "style": "gradient" + }, + "analytics": { + "posthog": { + "apiKey": "phc_GLxaINoQJnuyCyxDmTciQqzdKBYFVDkY7bRBO4bDdso" + } + }, + "feedback": { + "thumbsRating": true + }, + "topbarCtaButton": { + "name": "GitHub", + "url": "https://github.com/codegen-sh/codegen-sdk" + }, + "tabs": [ + { + "name": "API Reference", + "url": "/api-reference" + }, + { + "name": "CLI", + "url": "/cli" + }, + { + "name": "Blog", + "url": "/blog" + }, + { + "name": "Changelog", + "url": "/changelog" + } + ], + "navigation": [ + { + "group": "Introduction", + "pages": [ + "introduction/overview", + "introduction/getting-started", + "introduction/installation", + "introduction/ide-usage", + "introduction/work-with-ai", + "introduction/how-it-works", + "introduction/advanced-settings", + "introduction/guiding-principles", + "introduction/community", + "introduction/about", + "introduction/faq" + ] + }, + { + "group": 
"Tutorials", + "pages": [ + "tutorials/at-a-glance", + "tutorials/build-code-agent", + "tutorials/slack-bot", + "tutorials/github-review-bot", + "tutorials/deep-code-research", + "tutorials/training-data", + "tutorials/codebase-visualization", + "tutorials/migrating-apis", + "tutorials/organize-your-codebase", + "tutorials/promise-to-async-await", + "tutorials/modularity", + "tutorials/manage-feature-flags", + "tutorials/deleting-dead-code", + "tutorials/increase-type-coverage", + "tutorials/managing-typescript-exports", + "tutorials/converting-default-exports", + "tutorials/creating-documentation", + "tutorials/react-modernization", + "tutorials/unittest-to-pytest", + "tutorials/sqlalchemy-1.6-to-2.0", + "tutorials/fixing-import-loops-in-pytorch", + "tutorials/python2-to-python3", + "tutorials/flask-to-fastapi", + "tutorials/build-mcp", + "tutorials/neo4j-graph" + ] + }, + { + "group": "Building with Codegen", + "pages": [ + "building-with-codegen/at-a-glance", + "building-with-codegen/parsing-codebases", + "building-with-codegen/reusable-codemods", + "building-with-codegen/dot-codegen", + "building-with-codegen/function-decorator", + "building-with-codegen/language-support", + "building-with-codegen/commit-and-reset", + "building-with-codegen/git-operations", + "building-with-codegen/files-and-directories", + "building-with-codegen/the-editable-api", + "building-with-codegen/symbol-api", + "building-with-codegen/class-api", + "building-with-codegen/imports", + "building-with-codegen/exports", + "building-with-codegen/inheritable-behaviors", + "building-with-codegen/statements-and-code-blocks", + "building-with-codegen/dependencies-and-usages", + "building-with-codegen/function-calls-and-callsites", + "building-with-codegen/variable-assignments", + "building-with-codegen/local-variables", + "building-with-codegen/comments-and-docstrings", + "building-with-codegen/external-modules", + "building-with-codegen/type-annotations", + 
"building-with-codegen/moving-symbols", + "building-with-codegen/collections", + "building-with-codegen/traversing-the-call-graph", + "building-with-codegen/react-and-jsx", + "building-with-codegen/codebase-visualization", + "building-with-codegen/flagging-symbols", + "building-with-codegen/calling-out-to-llms", + "building-with-codegen/semantic-code-search", + "building-with-codegen/reducing-conditions" + ] + }, + { + "group": "CLI", + "pages": [ + "cli/about", + "cli/init", + "cli/notebook", + "cli/create", + "cli/run", + "cli/reset", + "cli/expert" + ] + }, + { + "group": "Changelog", + "pages": ["changelog/changelog"] + }, + { + "group": "Blog", + "pages": [ + "blog/posts", + "blog/devin", + "blog/act-via-code", + "blog/promise-to-async-await-twilio", + "blog/fixing-import-loops" + ] + }, + { + "group": "API Reference", + "pages": [ + "api-reference/index", + { + "group": "Core", + "icon": "code", + "pages": [ + "api-reference/core/Argument", + "api-reference/core/Assignment", + "api-reference/core/AssignmentStatement", + "api-reference/core/Attribute", + "api-reference/core/AwaitExpression", + "api-reference/core/BinaryExpression", + "api-reference/core/BlockStatement", + "api-reference/core/Boolean", + "api-reference/core/Callable", + "api-reference/core/CatchStatement", + "api-reference/core/ChainedAttribute", + "api-reference/core/Class", + "api-reference/core/CodeBlock", + "api-reference/core/CodeOwner", + "api-reference/core/Codebase", + "api-reference/core/Comment", + "api-reference/core/CommentGroup", + "api-reference/core/ComparisonExpression", + "api-reference/core/Decorator", + "api-reference/core/Dict", + "api-reference/core/Directory", + "api-reference/core/Editable", + "api-reference/core/Export", + "api-reference/core/ExportStatement", + "api-reference/core/Exportable", + "api-reference/core/Expression", + "api-reference/core/ExpressionGroup", + "api-reference/core/ExpressionStatement", + "api-reference/core/ExternalModule", + 
"api-reference/core/File", + "api-reference/core/FlagKwargs", + "api-reference/core/ForLoopStatement", + "api-reference/core/Function", + "api-reference/core/FunctionCall", + "api-reference/core/GenericType", + "api-reference/core/HasBlock", + "api-reference/core/HasName", + "api-reference/core/HasValue", + "api-reference/core/IfBlockStatement", + "api-reference/core/Import", + "api-reference/core/ImportStatement", + "api-reference/core/ImportType", + "api-reference/core/Importable", + "api-reference/core/Interface", + "api-reference/core/List", + "api-reference/core/MessageType", + "api-reference/core/MultiExpression", + "api-reference/core/MultiLineCollection", + "api-reference/core/Name", + "api-reference/core/NamedType", + "api-reference/core/NoneType", + "api-reference/core/Number", + "api-reference/core/Pair", + "api-reference/core/Parameter", + "api-reference/core/ParenthesizedExpression", + "api-reference/core/Placeholder", + "api-reference/core/PlaceholderType", + "api-reference/core/RaiseStatement", + "api-reference/core/ReturnStatement", + "api-reference/core/SourceFile", + "api-reference/core/Span", + "api-reference/core/Statement", + "api-reference/core/StatementType", + "api-reference/core/String", + "api-reference/core/StubPlaceholder", + "api-reference/core/SubscriptExpression", + "api-reference/core/SwitchCase", + "api-reference/core/SwitchStatement", + "api-reference/core/Symbol", + "api-reference/core/SymbolGroup", + "api-reference/core/SymbolStatement", + "api-reference/core/TernaryExpression", + "api-reference/core/TryCatchStatement", + "api-reference/core/Tuple", + "api-reference/core/TupleType", + "api-reference/core/Type", + "api-reference/core/TypeAlias", + "api-reference/core/TypePlaceholder", + "api-reference/core/Typeable", + "api-reference/core/UnaryExpression", + "api-reference/core/UnionType", + "api-reference/core/Unpack", + "api-reference/core/Unwrappable", + "api-reference/core/Usable", + "api-reference/core/Usage", + 
"api-reference/core/UsageKind", + "api-reference/core/UsageType", + "api-reference/core/Value", + "api-reference/core/WhileStatement", + "api-reference/core/WithStatement" + ] + }, + { + "group": "Python", + "icon": "python", + "pages": [ + "api-reference/python/PyAssignment", + "api-reference/python/PyAssignmentStatement", + "api-reference/python/PyAttribute", + "api-reference/python/PyBlockStatement", + "api-reference/python/PyBreakStatement", + "api-reference/python/PyCatchStatement", + "api-reference/python/PyChainedAttribute", + "api-reference/python/PyClass", + "api-reference/python/PyCodeBlock", + "api-reference/python/PyComment", + "api-reference/python/PyCommentGroup", + "api-reference/python/PyCommentType", + "api-reference/python/PyConditionalExpression", + "api-reference/python/PyDecorator", + "api-reference/python/PyFile", + "api-reference/python/PyForLoopStatement", + "api-reference/python/PyFunction", + "api-reference/python/PyGenericType", + "api-reference/python/PyHasBlock", + "api-reference/python/PyIfBlockStatement", + "api-reference/python/PyImport", + "api-reference/python/PyImportStatement", + "api-reference/python/PyMatchCase", + "api-reference/python/PyMatchStatement", + "api-reference/python/PyNamedType", + "api-reference/python/PyParameter", + "api-reference/python/PyPassStatement", + "api-reference/python/PyReturnTypePlaceholder", + "api-reference/python/PyString", + "api-reference/python/PySymbol", + "api-reference/python/PyTryCatchStatement", + "api-reference/python/PyUnionType", + "api-reference/python/PyWhileStatement" + ] + }, + { + "group": "Typescript", + "icon": "js", + "pages": [ + "api-reference/typescript/JSXElement", + "api-reference/typescript/JSXExpression", + "api-reference/typescript/JSXProp", + "api-reference/typescript/TSArrayType", + "api-reference/typescript/TSAssignment", + "api-reference/typescript/TSAssignmentStatement", + "api-reference/typescript/TSAttribute", + "api-reference/typescript/TSBlockStatement", + 
"api-reference/typescript/TSCatchStatement", + "api-reference/typescript/TSChainedAttribute", + "api-reference/typescript/TSClass", + "api-reference/typescript/TSCodeBlock", + "api-reference/typescript/TSComment", + "api-reference/typescript/TSCommentGroup", + "api-reference/typescript/TSCommentType", + "api-reference/typescript/TSConditionalType", + "api-reference/typescript/TSConfig", + "api-reference/typescript/TSDecorator", + "api-reference/typescript/TSDict", + "api-reference/typescript/TSEnum", + "api-reference/typescript/TSExport", + "api-reference/typescript/TSExpressionType", + "api-reference/typescript/TSFile", + "api-reference/typescript/TSForLoopStatement", + "api-reference/typescript/TSFunction", + "api-reference/typescript/TSFunctionType", + "api-reference/typescript/TSGenericType", + "api-reference/typescript/TSHasBlock", + "api-reference/typescript/TSIfBlockStatement", + "api-reference/typescript/TSImport", + "api-reference/typescript/TSImportStatement", + "api-reference/typescript/TSInterface", + "api-reference/typescript/TSLabeledStatement", + "api-reference/typescript/TSLookupType", + "api-reference/typescript/TSNamedType", + "api-reference/typescript/TSNamespace", + "api-reference/typescript/TSObjectType", + "api-reference/typescript/TSPair", + "api-reference/typescript/TSParameter", + "api-reference/typescript/TSQueryType", + "api-reference/typescript/TSReadonlyType", + "api-reference/typescript/TSReturnTypePlaceholder", + "api-reference/typescript/TSString", + "api-reference/typescript/TSSwitchCase", + "api-reference/typescript/TSSwitchStatement", + "api-reference/typescript/TSSymbol", + "api-reference/typescript/TSTernaryExpression", + "api-reference/typescript/TSTryCatchStatement", + "api-reference/typescript/TSTypeAlias", + "api-reference/typescript/TSUndefinedType", + "api-reference/typescript/TSUnionType", + "api-reference/typescript/TSWhileStatement" + ] + } + ] + } + ], + "footerSocials": { + "x": "https://x.com/codegen", + "linkedin": 
"https://linkedin.com/company/codegen-dot-com" + } } diff --git a/src/codegen/extensions/attribution/git_history.py b/src/codegen/extensions/attribution/git_history.py index 06450c32f..39dfcc740 100644 --- a/src/codegen/extensions/attribution/git_history.py +++ b/src/codegen/extensions/attribution/git_history.py @@ -33,15 +33,15 @@ def __init__(self, codebase: Codebase, ai_authors: Optional[list[str]] = None): # Cache structures self._file_history = {} # file path -> list of commit info - self._symbol_history:defaultdict[str,list] = defaultdict(list) # symbol id -> list of commit info + self._symbol_history: defaultdict[str, list] = defaultdict(list) # symbol id -> list of commit info self._author_contributions = defaultdict(list) # author -> list of commit info # Track if history has been built self._history_built = False - self._file_symbol_location_state:dict[str,IntervalTree] = {} + self._file_symbol_location_state: dict[str, IntervalTree] = {} - self._commits:deque[Commit] + self._commits: deque[Commit] def build_history(self, max_commits: Optional[int] = None) -> None: """Build the git history for the codebase. 
@@ -65,7 +65,7 @@ def build_history(self, max_commits: Optional[int] = None) -> None: commit_count = 0 author_set = set() - self._commits=deque() + self._commits = deque() try: for commit in self.repo.walk(self.repo.head.target, SortMode.TIME): # Track unique authors @@ -153,35 +153,32 @@ def _process_commit(self, commit, diff) -> None: file_commit["file_path"] = file_path self._file_history[file_path].append(file_commit) - - def _process_symbol_location_state(self, filepaths:list[str]): + def _process_symbol_location_state(self, filepaths: list[str]): for filepath in filepaths: file = self.codebase.get_file(filepath) filetree = IntervalTree() try: for symbol in file.symbols: - symbol:Symbol - start_line=symbol.range.start_point.row+1 # 1 Indexing - end_line=symbol.range.end_point.row+2 # Intervaltree is end non-inclusive - filetree.addi(start_line,end_line,symbol) + symbol: Symbol + start_line = symbol.range.start_point.row + 1 # 1 Indexing + end_line = symbol.range.end_point.row + 2 # Intervaltree is end non-inclusive + filetree.addi(start_line, end_line, symbol) except Exception as e: pass self._file_symbol_location_state[filepath] = filetree - def _get_symbols_affected_by_patch(self,patch:Patch,filepath): + def _get_symbols_affected_by_patch(self, patch: Patch, filepath): if filepath not in self._file_symbol_location_state: return [] - symbols_affected=set() + symbols_affected = set() for hunk in patch.hunks: start = hunk.new_start - end = start+hunk.new_lines # Intervaltree is end non-inclusive - for interval in self._file_symbol_location_state[filepath].overlap(start,end): + end = start + hunk.new_lines # Intervaltree is end non-inclusive + for interval in self._file_symbol_location_state[filepath].overlap(start, end): symbols_affected.add(interval[2]) return symbols_affected - - def _is_tracked_file(self, file_path: str) -> bool: """Check if a file should be tracked based on extension.""" # Get file extensions from the codebase @@ -198,7 +195,7 @@ def 
_ensure_history_built(self) -> None: if not self._history_built: self.build_history() - def map_symbols_to_history(self,force=False) -> None: + def map_symbols_to_history(self, force=False) -> None: """Map symbols in the codebase to their git history. force ensures a rerun even if data is already found!""" self._ensure_history_built() if self._symbol_history: @@ -208,19 +205,17 @@ def map_symbols_to_history(self,force=False) -> None: print("Mapping symbols to git history...") start_time = time.time() - - print("Stashing any working directory changes...") stash_msg = f"Codegen Attribution Stash @ {datetime.now().timestamp()}" - stash_id=None + stash_id = None try: - stash_id = self.repo.stash(self.repo.default_signature,stash_msg,include_untracked=True) + stash_id = self.repo.stash(self.repo.default_signature, stash_msg, include_untracked=True) print("Stashed!") except KeyError as e: print("Nothing to stash, proceeding.....") except Exception as e: print("Error encountered attempting to stash the current working state, stopping to preserve work, please manually clean the working directory and try again!") - raise(e) + raise (e) print("Generating initial symbol state...") filepaths = [file.filepath for file in self.codebase.files] @@ -228,7 +223,7 @@ def map_symbols_to_history(self,force=False) -> None: elapsed = time.time() - start_time print(f"Finished initial symbol state generation in {elapsed:.2f} seconds.") - symbol_tracking_checkpoint=time.time() + symbol_tracking_checkpoint = time.time() try: print("Starting symbol tracking procedure....") for commit in self._commits: @@ -246,36 +241,36 @@ def map_symbols_to_history(self,force=False) -> None: } commit_previous = commit.parents[0] if commit.parents else None if not commit_previous: - #If Last commit + # If Last commit empty_tree_old = self.repo.TreeBuilder().write() - empty_tree=self.repo.get(empty_tree_old) - diff = self.repo.diff(empty_tree,commit.tree) + empty_tree = self.repo.get(empty_tree_old) + diff = 
self.repo.diff(empty_tree, commit.tree) else: - diff = self.repo.diff(commit_previous, commit,context_lines=0) #We don't need context lines + diff = self.repo.diff(commit_previous, commit, context_lines=0) # We don't need context lines - if isinstance(diff,Patch): - diff=[diff] - sync_past_filepaths=[] #Files to sync in the past commit + if isinstance(diff, Patch): + diff = [diff] + sync_past_filepaths = [] # Files to sync in the past commit for patch in diff: - filepath=patch.delta.new_file.path + filepath = patch.delta.new_file.path if not self._is_tracked_file(filepath): - continue #Ignore files we don't track - if not patch.delta.status==DeltaStatus.ADDED: #Reversed since we're going backwards, if it doesn't exist in the past commits don't sync! + continue # Ignore files we don't track + if not patch.delta.status == DeltaStatus.ADDED: # Reversed since we're going backwards, if it doesn't exist in the past commits don't sync! sync_past_filepaths.append(filepath) - symbols_affected = self._get_symbols_affected_by_patch(patch,filepath) + symbols_affected = self._get_symbols_affected_by_patch(patch, filepath) for symbol in symbols_affected: - symbol_id = f"{symbol.filepath}:{symbol.name}" #For future stuff might want to do this more neatly and allow for future dead symbols/renames + symbol_id = f"{symbol.filepath}:{symbol.name}" # For future stuff might want to do this more neatly and allow for future dead symbols/renames self._symbol_history[symbol_id].append(commit_info) if commit_previous: - #If not last commit - self.repo.checkout_tree(commit_previous,strategy=CheckoutStrategy.FORCE) + # If not last commit + self.repo.checkout_tree(commit_previous, strategy=CheckoutStrategy.FORCE) self.repo.set_head(commit_previous.id) files = [self.codebase.get_file(fp) for fp in sync_past_filepaths] - exclude_state_files=[] + exclude_state_files = [] for file in files: - if not isinstance(file,SourceFile): - #What kind of pyfiles are not source files? To investigate! 
+ if not isinstance(file, SourceFile): + # What kind of pyfiles are not source files? To investigate! exclude_state_files.append(file.filepath) continue file.sync_with_file_content() @@ -283,21 +278,21 @@ def map_symbols_to_history(self,force=False) -> None: finally: print("Finished, restoring git repo state...") - self.repo.checkout(self.org_branch_reference,strategy=CheckoutStrategy.FORCE) + self.repo.checkout(self.org_branch_reference, strategy=CheckoutStrategy.FORCE) print(f"Restored to latest commit, newest commit id in repo is {self.repo.revparse_single(self.org_branch_reference.name).id}") if stash_id: - #Restoring Working Directory + # Restoring Working Directory print("Restoring working directory changes...") - found_stash=None - for idx,stash in enumerate(self.repo.listall_stashes()): + found_stash = None + for idx, stash in enumerate(self.repo.listall_stashes()): if stash_msg in stash.message: - found_stash=idx + found_stash = idx break - if found_stash==0: + if found_stash == 0: print("Applying stash..") - self.repo.stash_apply(0,reinstate_index=True) + self.repo.stash_apply(0, reinstate_index=True) print("Applied Stash") self.repo.stash_drop(0) print("Stash Removed!") @@ -307,11 +302,9 @@ def map_symbols_to_history(self,force=False) -> None: print(f"Codebase stash msg:{stash_msg}") print(f"Codebase stash oid:{stash_id}") - - end_time = time.time() elapsed_total = end_time - start_time - elapsed_symbol_tracking = end_time-symbol_tracking_checkpoint + elapsed_symbol_tracking = end_time - symbol_tracking_checkpoint print(f"Finished symbol tracking in {elapsed_symbol_tracking:.2f} seconds.") print(f"Finished mapping symbols in {elapsed_total:.2f} seconds.")