Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ docker run --rm \
| `BIGQUERY_PROJECT` | Yes | - | Google Cloud Project ID |
| `BIGQUERY_DATASET` | Yes | - | BigQuery dataset ID |
| `GOOGLE_APPLICATION_CREDENTIALS` | Yes* | - | Path to GCP service account JSON file (*or use Workload Identity) |
| `FETCH_COMMITS` | No | true | Whether to fetch commit data for each PR (set to "false" to reduce API calls) |
| `FETCH_REVIEWERS` | No | true | Whether to fetch reviewer data for each PR (set to "false" to reduce API calls) |
| `FETCH_COMMENTS` | No | true | Whether to fetch comment data for each PR (set to "false" to reduce API calls) |

## Architecture

Expand Down Expand Up @@ -97,6 +100,27 @@ requests in batches of 100:
- Early failure detection
- Supports streaming data pipelines

### Performance Optimization

By default, the ETL fetches commits, reviewers, and comments for each pull request, which creates an N+1 query pattern. For 250 PRs, this results in 250 + (250×3) = 1000 API calls.

To improve performance when this additional data is not needed, you can disable fetching of specific data types using environment variables:

```bash
# Disable all extra data fetching (only fetch PR metadata)
FETCH_COMMITS=false FETCH_REVIEWERS=false FETCH_COMMENTS=false python3 main.py

# Disable only commits and comments (still fetch reviewers)
FETCH_COMMITS=false FETCH_COMMENTS=false python3 main.py
```

This can significantly reduce:
- **API calls**: From 1000 to 250 for 250 PRs (when all extras are disabled)
- **Processing time**: Proportional to the reduction in API calls
- **Rate limit pressure**: Fewer calls mean you stay within GitHub's rate limits longer

The corresponding BigQuery tables will still be populated, but disabled data types default to empty values: `[]` for commits and comments, and `{"users": [], "teams": []}` for reviewers.

## BigQuery Table Schema

Before running the ETL, create a BigQuery table with the following schema:
Expand Down
62 changes: 48 additions & 14 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def extract_pull_requests(
repo: str,
chunk_size: int = 100,
github_api_url: Optional[str] = None,
fetch_commits: bool = True,
fetch_reviewers: bool = True,
fetch_comments: bool = True,
) -> Iterator[list[dict]]:
"""
Extract data from GitHub repositories in chunks.
Expand All @@ -47,6 +50,9 @@ def extract_pull_requests(
repo: GitHub repository name
chunk_size: Number of PRs to yield per chunk (default: 100)
github_api_url: Optional custom GitHub API URL (for testing/mocking)
fetch_commits: Whether to fetch commit data for each PR (default: True)
fetch_reviewers: Whether to fetch reviewer data for each PR (default: True)
fetch_comments: Whether to fetch comment data for each PR (default: True)

Yields:
List of pull request dictionaries (up to chunk_size items)
Expand Down Expand Up @@ -89,15 +95,28 @@ def extract_pull_requests(
pr_number = pr.get("number")
if not pr_number:
continue
pr["commit_data"] = extract_commits(
session, repo, pr_number, github_api_url
)
pr["reviewer_data"] = extract_reviewers(
session, repo, pr_number, github_api_url
)
pr["comment_data"] = extract_comments(
session, repo, pr_number, github_api_url
)

# Conditionally fetch additional data to avoid N+1 query pattern
if fetch_commits:
pr["commit_data"] = extract_commits(
session, repo, pr_number, github_api_url
)
else:
pr["commit_data"] = []

if fetch_reviewers:
pr["reviewer_data"] = extract_reviewers(
session, repo, pr_number, github_api_url
)
else:
pr["reviewer_data"] = {"users": [], "teams": []}

if fetch_comments:
pr["comment_data"] = extract_comments(
session, repo, pr_number, github_api_url
)
else:
pr["comment_data"] = []

yield batch

Expand Down Expand Up @@ -154,7 +173,7 @@ def extract_reviewers(
repo: str,
pr_number: int,
github_api_url: Optional[str] = None,
) -> list[dict]:
) -> dict:
"""
Extract reviewers for a specific pull request.

Expand All @@ -164,7 +183,7 @@ def extract_reviewers(
pr_number: Pull request number
github_api_url: Optional custom GitHub API URL (for testing/mocking)
Returns:
List of reviewer dictionaries for the pull request
Dictionary with 'users' and 'teams' keys containing reviewer data
"""
logger = logging.getLogger(__name__)
logger.info(f"Extracting reviews for PR #{pr_number}")
Expand Down Expand Up @@ -450,18 +469,33 @@ def main() -> int:
bigquery_client = bigquery.Client(project=bigquery_project)

# Read GitHub repository configuration
github_repos = os.getenv("GITHUB_REPOS").split(",")
if not github_repos:
github_repos_env = os.getenv("GITHUB_REPOS")
if not github_repos_env:
raise SystemExit(
"Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"
)
github_repos = github_repos_env.split(",")

# Read optional data fetching configuration (defaults to True for backward compatibility)
fetch_commits = os.getenv("FETCH_COMMITS", "true").lower() in ("true", "1", "yes")
fetch_reviewers = os.getenv("FETCH_REVIEWERS", "true").lower() in ("true", "1", "yes")
fetch_comments = os.getenv("FETCH_COMMENTS", "true").lower() in ("true", "1", "yes")

if not fetch_commits:
logger.info("Commit data fetching is disabled")
if not fetch_reviewers:
logger.info("Reviewer data fetching is disabled")
if not fetch_comments:
logger.info("Comment data fetching is disabled")

total_processed = 0

for repo in github_repos:
for chunk_count, chunk in enumerate(
extract_pull_requests(
session, repo, chunk_size=100, github_api_url=github_api_url
session, repo, chunk_size=100, github_api_url=github_api_url,
fetch_commits=fetch_commits, fetch_reviewers=fetch_reviewers,
fetch_comments=fetch_comments
), start=1
):
logger.info(f"Processing chunk {chunk_count} with {len(chunk)} PRs")
Expand Down