-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjoss_license_extractor.py
More file actions
120 lines (97 loc) · 4.5 KB
/
joss_license_extractor.py
File metadata and controls
120 lines (97 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import csv
import requests
import time
# --- Configuration ---
# It's highly recommended to use a GitHub Personal Access Token to avoid rate limiting.
# Unauthenticated requests are limited to 60 requests per hour.
# Create a token here: https://github.com/settings/tokens
# The token does not need any special permissions for this script.
# ONLY Works for Github. Need to fix for Not Github repo's too and need to discuss the approach for that in general.
GITHUB_TOKEN = "" # Paste your token here
CSV_FILE_PATH = "joss_repositories_20250806_080130.csv"
OUTPUT_CSV_FILE = "joss_repositories_20250806_080130_licenses.csv"
# -------------------
def get_repo_license(github_url):
"""
Fetches the license information for a given GitHub repository URL.
Args:
github_url (str): The full URL of the GitHub repository.
Returns:
str: The name of the license, or a message indicating it's not found or an error occurred.
"""
try:
# Split the URL to get the owner and repository name
parts = github_url.strip().split('/')
if len(parts) < 5 or parts[2].lower() != 'github.com':
return "Invalid GitHub URL"
owner = parts[3]
repo = parts[4]
# Construct the API URL
api_url = f"https://api.github.com/repos/{owner}/{repo}"
# Set up headers for the API request
headers = {
"Accept": "application/vnd.github.v3+json"
}
if GITHUB_TOKEN:
headers["Authorization"] = f"token {GITHUB_TOKEN}"
# Make the API request
response = requests.get(api_url, headers=headers)
# Check for rate limiting
if response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
print("Rate limit exceeded. Waiting for 1 minute before retrying...")
time.sleep(60)
response = requests.get(api_url, headers=headers) # Retry once
# Raise an exception for other bad status codes (404, 500, etc.)
response.raise_for_status()
repo_data = response.json()
# Check if the license information is present
if repo_data.get("license") and repo_data["license"].get("name"):
return repo_data["license"]["name"]
else:
return "License not specified"
except requests.exceptions.HTTPError as http_err:
if http_err.response.status_code == 404:
return "Repository not found"
return f"HTTP error occurred: {http_err}"
except requests.exceptions.RequestException as req_err:
return f"Request error occurred: {req_err}"
except IndexError:
return "Could not parse owner/repo from URL"
except Exception as err:
return f"An unexpected error occurred: {err}"
def main():
"""
Main function to read the CSV, process each repository, and write results to a new CSV.
"""
print(f"Reading repositories from '{CSV_FILE_PATH}'...")
try:
with open(CSV_FILE_PATH, mode='r', newline='', encoding='utf-8') as infile, \
open(OUTPUT_CSV_FILE, mode='w', newline='', encoding='utf-8') as outfile:
# Set up CSV writer and write the header
writer = csv.writer(outfile)
writer.writerow(["Repository URL", "License"])
# Assuming the CSV has one column with the URLs and may or may not have a header
# Sniffing the dialect helps handle different CSV formats
dialect = csv.Sniffer().sniff(infile.read(1024))
infile.seek(0)
# Check if there is a header
has_header = csv.Sniffer().has_header(infile.read(1024))
infile.seek(0)
reader = csv.reader(infile, dialect)
if has_header:
next(reader) # Skip header row
for row in reader:
if row: # Ensure the row is not empty
repo_url = row[0]
print(f"Processing: {repo_url}")
license_name = get_repo_license(repo_url)
print(f"-> License: {license_name}")
# Write the result to the output file
writer.writerow([repo_url, license_name])
print(f"\nProcessing complete. Results have been saved to '{OUTPUT_CSV_FILE}'.")
except FileNotFoundError:
print(f"Error: The file '{CSV_FILE_PATH}' was not found.")
except Exception as e:
print(f"An error occurred: {e}")
if __name__ == "__main__":
main()