Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ docker run --rm \
| `BIGQUERY_PROJECT` | Yes | - | Google Cloud Project ID |
| `BIGQUERY_DATASET` | Yes | - | BigQuery dataset ID |
| `GOOGLE_APPLICATION_CREDENTIALS` | Yes* | - | Path to GCP service account JSON file (*or use Workload Identity) |
| `FETCH_COMMITS` | No | true | Whether to fetch commit data for each PR (set to "false" to reduce API calls) |
| `FETCH_REVIEWERS` | No | true | Whether to fetch reviewer data for each PR (set to "false" to reduce API calls) |
| `FETCH_COMMENTS` | No | true | Whether to fetch comment data for each PR (set to "false" to reduce API calls) |

## Architecture

Expand Down Expand Up @@ -97,6 +100,27 @@ requests in batches of 100:
- Early failure detection
- Supports streaming data pipelines

### Performance Optimization

By default, the ETL fetches commits, reviewers, and comments for each pull request, which creates an N+1 query pattern. For 250 PRs, this results in 250 + (250×3) = 1000 API calls.

To improve performance when this additional data is not needed, you can disable fetching of specific data types using environment variables:

```bash
# Disable all extra data fetching (only fetch PR metadata)
FETCH_COMMITS=false FETCH_REVIEWERS=false FETCH_COMMENTS=false python3 main.py

# Disable only commits and comments (still fetch reviewers)
FETCH_COMMITS=false FETCH_COMMENTS=false python3 main.py
```

This can significantly reduce:
- **API calls**: From 1000 to 250 for 250 PRs (when all extras are disabled)
- **Processing time**: Proportional to the reduction in API calls
- **Rate limit pressure**: Fewer calls mean you stay within GitHub's rate limits longer

The corresponding BigQuery tables will still be populated, but disabled data types default to empty values: `[]` for commits and comments, and `{"users": [], "teams": []}` for reviewers.

## BigQuery Table Schema

Before running the ETL, create a BigQuery table with the following schema:
Expand Down
62 changes: 48 additions & 14 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def extract_pull_requests(
repo: str,
chunk_size: int = 100,
github_api_url: Optional[str] = None,
fetch_commits: bool = True,
fetch_reviewers: bool = True,
fetch_comments: bool = True,
) -> Iterator[list[dict]]:
"""
Extract data from GitHub repositories in chunks.
Expand All @@ -47,6 +50,9 @@ def extract_pull_requests(
repo: GitHub repository name
chunk_size: Number of PRs to yield per chunk (default: 100)
github_api_url: Optional custom GitHub API URL (for testing/mocking)
fetch_commits: Whether to fetch commit data for each PR (default: True)
fetch_reviewers: Whether to fetch reviewer data for each PR (default: True)
fetch_comments: Whether to fetch comment data for each PR (default: True)

Yields:
List of pull request dictionaries (up to chunk_size items)
Expand Down Expand Up @@ -89,15 +95,28 @@ def extract_pull_requests(
pr_number = pr.get("number")
if not pr_number:
continue
pr["commit_data"] = extract_commits(
session, repo, pr_number, github_api_url
)
pr["reviewer_data"] = extract_reviewers(
session, repo, pr_number, github_api_url
)
pr["comment_data"] = extract_comments(
session, repo, pr_number, github_api_url
)

# Conditionally fetch additional data to avoid N+1 query pattern
if fetch_commits:
pr["commit_data"] = extract_commits(
session, repo, pr_number, github_api_url
)
else:
pr["commit_data"] = []

if fetch_reviewers:
pr["reviewer_data"] = extract_reviewers(
session, repo, pr_number, github_api_url
)
else:
pr["reviewer_data"] = {"users": [], "teams": []}

if fetch_comments:
pr["comment_data"] = extract_comments(
session, repo, pr_number, github_api_url
)
else:
pr["comment_data"] = []

yield batch

Expand Down Expand Up @@ -154,7 +173,7 @@ def extract_reviewers(
repo: str,
pr_number: int,
github_api_url: Optional[str] = None,
) -> list[dict]:
) -> dict:
"""
Extract reviewers for a specific pull request.

Expand All @@ -164,7 +183,7 @@ def extract_reviewers(
pr_number: Pull request number
github_api_url: Optional custom GitHub API URL (for testing/mocking)
Returns:
List of reviewer dictionaries for the pull request
Dictionary with 'users' and 'teams' keys containing reviewer data
"""
logger = logging.getLogger(__name__)
logger.info(f"Extracting reviews for PR #{pr_number}")
Expand Down Expand Up @@ -450,18 +469,33 @@ def main() -> int:
bigquery_client = bigquery.Client(project=bigquery_project)

# Read GitHub repository configuration
github_repos = os.getenv("GITHUB_REPOS").split(",")
if not github_repos:
github_repos_env = os.getenv("GITHUB_REPOS")
if not github_repos_env:
raise SystemExit(
"Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"
)
github_repos = github_repos_env.split(",")

# Read optional data fetching configuration (defaults to True for backward compatibility)
fetch_commits = os.getenv("FETCH_COMMITS", "true").lower() in ("true", "1", "yes")
fetch_reviewers = os.getenv("FETCH_REVIEWERS", "true").lower() in ("true", "1", "yes")
fetch_comments = os.getenv("FETCH_COMMENTS", "true").lower() in ("true", "1", "yes")

if not fetch_commits:
logger.info("Commit data fetching is disabled")
if not fetch_reviewers:
logger.info("Reviewer data fetching is disabled")
if not fetch_comments:
logger.info("Comment data fetching is disabled")

total_processed = 0

for repo in github_repos:
for chunk_count, chunk in enumerate(
extract_pull_requests(
session, repo, chunk_size=100, github_api_url=github_api_url
session, repo, chunk_size=100, github_api_url=github_api_url,
fetch_commits=fetch_commits, fetch_reviewers=fetch_reviewers,
fetch_comments=fetch_comments
), start=1
):
logger.info(f"Processing chunk {chunk_count} with {len(chunk)} PRs")
Expand Down