From e8836a606617a6a06ca103bc4d905aa41ae07e15 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Tue, 7 Oct 2025 08:53:56 +0200 Subject: [PATCH] remove outdated stub scripts --- scripts/1-fetch/deviantart_fetch.py | 291 -------------- scripts/1-fetch/flickr_fetch.py | 395 ------------------- scripts/1-fetch/internetarchive_fetch.py | 252 ------------ scripts/1-fetch/metmuseum_fetch.py | 219 ---------- scripts/1-fetch/vimeo_fetch.py | 261 ------------ scripts/1-fetch/wikicommons_fetch.py | 302 -------------- scripts/1-fetch/wikipedia_fetch.py | 246 ------------ scripts/1-fetch/youtube_fetch.py | 284 ------------- scripts/2-process/deviantart_process.py | 97 ----- scripts/2-process/flickr_process.py | 97 ----- scripts/2-process/internetarchive_process.py | 97 ----- scripts/2-process/metmuseum_process.py | 97 ----- scripts/2-process/vimeo_process.py | 97 ----- scripts/2-process/wikicommons_process.py | 97 ----- scripts/2-process/wikipedia_process.py | 97 ----- scripts/2-process/youtube_process.py | 97 ----- scripts/3-report/deviantart_report.py | 195 --------- scripts/3-report/flickr_report.py | 144 ------- scripts/3-report/internetarchive_report.py | 194 --------- scripts/3-report/metmuseum_report.py | 192 --------- scripts/3-report/vimeo_report.py | 187 --------- scripts/3-report/wikicommons_report.py | 246 ------------ scripts/3-report/wikipedia_report.py | 191 --------- scripts/3-report/youtube_report.py | 191 --------- 24 files changed, 4566 deletions(-) delete mode 100755 scripts/1-fetch/deviantart_fetch.py delete mode 100755 scripts/1-fetch/flickr_fetch.py delete mode 100755 scripts/1-fetch/internetarchive_fetch.py delete mode 100755 scripts/1-fetch/metmuseum_fetch.py delete mode 100755 scripts/1-fetch/vimeo_fetch.py delete mode 100755 scripts/1-fetch/wikicommons_fetch.py delete mode 100755 scripts/1-fetch/wikipedia_fetch.py delete mode 100755 scripts/1-fetch/youtube_fetch.py delete mode 100755 scripts/2-process/deviantart_process.py delete mode 100755 scripts/2-process/flickr_process.py delete mode 100755 scripts/2-process/internetarchive_process.py delete mode 100755 scripts/2-process/metmuseum_process.py delete mode 100755 scripts/2-process/vimeo_process.py delete mode 100755 scripts/2-process/wikicommons_process.py delete mode 100755 scripts/2-process/wikipedia_process.py delete mode 100755 scripts/2-process/youtube_process.py delete mode 100755 scripts/3-report/deviantart_report.py delete mode 100755 scripts/3-report/flickr_report.py delete mode 100755 scripts/3-report/internetarchive_report.py delete mode 100755 scripts/3-report/metmuseum_report.py delete mode 100755 scripts/3-report/vimeo_report.py delete mode 100755 scripts/3-report/wikicommons_report.py delete mode 100755 scripts/3-report/wikipedia_report.py delete mode 100755 scripts/3-report/youtube_report.py diff --git a/scripts/1-fetch/deviantart_fetch.py b/scripts/1-fetch/deviantart_fetch.py deleted file mode 100755 index 3ab92fdc..00000000 --- a/scripts/1-fetch/deviantart_fetch.py +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from the DeviantArt API. 
-""" - -# Standard library -import argparse -import csv -import os -import sys -import traceback - -# Third-party -import pandas as pd -import requests -import yaml -from dotenv import load_dotenv -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Load environment variables -load_dotenv(PATHS["dotenv"]) - -# Global Variable for API_KEYS indexing -API_KEYS_IND = 0 - -# Gets API_KEYS and PSE_KEY from .env file -API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",") -PSE_KEY = os.getenv("PSE_KEY") - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser( - description="DeviantArt Data Fetching Script" - ) - parser.add_argument( - "--licenses", type=int, default=10, help="Number of licenses to query" - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = "LICENSE TYPE,Document Count\n" - with open( - os.path.join(PATHS["data_phase"], "deviantart_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_license_list(): - """ - Provides the list of licenses from Creative Commons. - - Returns: - list: A list containing all license types that should be searched. - """ - LOGGER.info("Retrieving list of licenses from Creative Commons' record.") - cc_license_data = pd.read_csv( - os.path.join(PATHS["repo"], "legal-tool-paths.txt"), header=None - ) - license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*" - license_list = ( - cc_license_data[0] - .str.extract(license_pattern, expand=False) - .dropna() - .unique() - ) - return license_list - - -def get_request_url(license_type): - """ - Provides the API Endpoint URL for a specified license type. - - Args: - license_type: A string representing the type of license. - - Returns: - str: The API Endpoint URL for the query specified by parameters. - """ - LOGGER.info(f"Generating API Endpoint URL for license: {license_type}") - try: - api_key = API_KEYS[API_KEYS_IND] - return ( - "https://customsearch.googleapis.com/customsearch/v1" - f"?key={api_key}&cx={PSE_KEY}" - "&q=_&relatedSite=deviantart.com" - f'&linkSite=creativecommons.org{license_type.replace("/", "%2F")}' - ) - except IndexError: - LOGGER.error("Depleted all API Keys provided") - raise shared.QuantifyingException("No API keys left to use", 1) - - -def get_response_elems(license_type): - """ - Retrieves the number of documents for the specified license type. - - Args: - license_type: A string representing the type of license. - - Returns: - dict: A dictionary containing the total document count. 
- """ - LOGGER.info(f"Querying metadata for license: {license_type}") - try: - request_url = get_request_url(license_type) - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 500, 502, 503, 504], - ) - session = requests.Session() - session.mount("https://", HTTPAdapter(max_retries=max_retries)) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - return { - "totalResults": search_data["searchInformation"]["totalResults"] - } - except requests.exceptions.HTTPError as e: - global API_KEYS_IND - API_KEYS_IND += 1 - LOGGER.error(f"HTTP Error: {e}. Switching to the next API key.") - if API_KEYS_IND < len(API_KEYS): - return get_response_elems(license_type) - else: - raise shared.QuantifyingException( - f"HTTP Error: {e}. No API keys left.", 1 - ) - except requests.RequestException as e: - LOGGER.error(f"Request Exception: {e}") - raise shared.QuantifyingException(f"Request Exception: {e}", 1) - except KeyError as e: - LOGGER.error(f"KeyError: {e}.") - raise shared.QuantifyingException(f"KeyError: {e}", 1) - - -def retrieve_license_data(args): - """ - Retrieves the data of all license types specified. - - Args: - args: Parsed command-line arguments. - - Returns: - int: The total number of documents retrieved. - """ - LOGGER.info("Retrieving the data for all license types.") - licenses = get_license_list()[: args.licenses] - - # data = [] - total_docs_retrieved = 0 - - for license_type in licenses: - data_dict = get_response_elems(license_type) - total_docs_retrieved += int(data_dict["totalResults"]) - record_results(license_type, data_dict) - - return total_docs_retrieved - - -def record_results(license_type, data): - """ - Records the data for a specific license type into the CSV file. - - Args: - license_type: The license type. - data: A dictionary containing the data to record. - """ - LOGGER.info(f"Recording data for license: {license_type}") - row = [license_type, data["totalResults"]] - with open( - os.path.join(PATHS["data_phase"], "deviantart_fetched.csv"), - "a", - newline="", - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - - Returns: - dict: The last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (deviantart)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - state = load_state() - total_docs_retrieved = state["total_records_retrieved (deviantart)"] - LOGGER.info(f"Initial total_records_retrieved: {total_docs_retrieved}") - goal_documents = 1000 # Set goal number of documents - - if total_docs_retrieved >= goal_documents: - LOGGER.info( - f"Goal of {goal_documents} documents already achieved." - " No further action required." 
- ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_docs_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - docs_retrieved = retrieve_license_data(args) - - # Update the state with the new count of retrieved records - total_docs_retrieved += docs_retrieved - LOGGER.info( - f"Total documents retrieved after fetching: {total_docs_retrieved}" - ) - state["total_records_retrieved (deviantart)"] = total_docs_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit DeviantArt data" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/flickr_fetch.py b/scripts/1-fetch/flickr_fetch.py deleted file mode 100755 index f37726aa..00000000 --- a/scripts/1-fetch/flickr_fetch.py +++ /dev/null @@ -1,395 +0,0 @@ -#!/usr/bin/env python -""" -Script to fetch photo information from Flickr API, process the data, -and save it into multiple CSV files and a JSON file. -""" - -# Standard library -import argparse -import csv -import json -import os -import sys -import time -import traceback - -# Third-party -import flickrapi -import pandas as pd -from dotenv import load_dotenv - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup paths, and LOGGER using quantify.setup() -LOGGER, PATHS = shared.setup(__file__) - -# Load environment variables -load_dotenv(PATHS["dotenv"]) - -# Global variable: Number of retries for error handling -RETRIES = 0 - -# Log the start of the script execution -LOGGER.info("Script execution started.") - -# PATHS["data_phase"], "flickr_fetched", - -# Flickr API rate limits -FLICKR_API_CALLS_PER_HOUR = 3600 -SECONDS_PER_HOUR = 3600 -API_CALL_INTERVAL = SECONDS_PER_HOUR / FLICKR_API_CALLS_PER_HOUR - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser(description="Google Custom Search Script") - parser.add_argument( - "--records", type=int, default=1, help="Number of records per query" - ) - parser.add_argument( - "--pages", type=int, default=1, help="Number of pages to query" - ) - parser.add_argument( - "--licenses", type=int, default=1, help="Number of licenses to query" - ) - return parser.parse_args() - - -def to_df(datalist, namelist): - """ - Transform data into a DataFrame. - - Args: - - datalist (list): List of lists containing data. - - namelist (list): List of column names. - - Returns: - - df (DataFrame): DataFrame constructed from the data. 
- """ - LOGGER.info("Transforming data into a DataFrame.") - df = pd.DataFrame(datalist).transpose() - df.columns = namelist - return df - - -def df_to_csv(temp_list, name_list, temp_csv, final_csv): - """ - Save data to temporary CSV and then merge it with final CSV. - - Args: - - temp_list (list): csv that is used for saving data every 100 seconds. - - name_list (list): List of column names. - - temp_csv (str): Temporary CSV file path. - - final_csv (str): Final CSV file path. - """ - LOGGER.info("Saving data to temporary CSV and merging with final CSV.") - df = to_df(temp_list, name_list) - df.to_csv(temp_csv, index=False) - # Merge temporary CSV with final CSV, ignoring index to avoid duplication - if os.path.exists(final_csv): - df_final = pd.read_csv(final_csv) - df = pd.concat([df_final, df], ignore_index=True) - df.to_csv(final_csv, index=False) - - -def creat_lisoflis(size): - """ - Create one list of list [[],[],[]] to save all the columns with - each column as a list. - - Args: - - size (int): Size of the list of lists. - - Returns: - - temp_list (list): List of empty lists. - """ - LOGGER.info("Creating list of lists for data storage.") - temp_list = [[] for _ in range(size)] - return temp_list - - -def clean_saveas_csv(old_csv_str, new_csv_str): - """ - Clean empty columns and save CSV to a new file. - - Args: - - old_csv_str (str): Path to the old CSV file. - - new_csv_str (str): Path to the new CSV file. - """ - LOGGER.info("Cleaning empty columns and saving CSV to a new file.") - data = pd.read_csv(old_csv_str, low_memory=False) - data = data.loc[:, ~data.columns.str.contains("^Unnamed")] - data.to_csv(new_csv_str, index=False) - - -def query_helper1(raw, part, detail, temp_list, index): - """ - Helper function 1 for querying data. - - Args: - - raw (dict): Raw data from API. - - part (str): Part of the data. - - detail (str): Detail to be queried. - - temp_list (list): List to store queried data. - - index (int): Index of the data in temp_list. - """ - queried_raw = raw["photo"][part][detail] - temp_list[index].append(queried_raw) - - -def query_helper2(raw, part, temp_list, index): - """ - Helper function 2 for querying data. - - Args: - - raw (dict): Raw data from API. - - part (str): Part of the data. - - temp_list (list): List to store queried data. - - index (int): Index of the data in temp_list. - """ - queried_raw = raw["photo"][part] - temp_list[index].append(queried_raw) - - -def query_data(raw_data, name_list, data_list): - """ - Query useful data from raw pulled data and store it in lists. - - Args: - - raw_data (dict): Raw data from API. - - name_list (list): List of column names. - - data_list (list): List of lists to store data. - """ - LOGGER.info( - "Querying useful data from raw pulled data and storing it in lists." - ) - for a in range(len(name_list)): - if (0 <= a < 4) or a == 9: - query_helper2(raw_data, name_list[a], data_list, a) - elif a in [4, 5]: - query_helper1(raw_data, "owner", name_list[a], data_list, a) - elif a in [6, 7, 10]: - query_helper1(raw_data, name_list[a], "_content", data_list, a) - elif a == 8: - query_helper1(raw_data, "dates", "taken", data_list, a) - if a == 11: - tags = raw_data["photo"]["tags"]["tag"] - data_list[a].append([tag["raw"] for tag in tags] if tags else []) - - -def page1_reset(final_csv, raw_data): - """ - Reset page count and update total picture count. - - Args: - - final_csv (str): Path to the final CSV file. - - raw_data (dict): Raw data from API call. - - Returns: - - int: Total number of pages. 
- """ - LOGGER.info("Resetting page count and updating total picture count.") - if os.path.exists(final_csv): - data = pd.read_csv(final_csv, low_memory=False) - data.drop(data.columns, axis=1, inplace=True) - data.to_csv(final_csv, index=False) - return raw_data["photos"]["pages"] - - -def handle_rate_limiting(): - """ - Handle rate limiting by pausing execution - to avoid hitting the API rate limit. - """ - LOGGER.info( - f"Sleeping for {API_CALL_INTERVAL} seconds to handle rate limiting." - ) - time.sleep(API_CALL_INTERVAL) - - -def process_data(): - final_csv_path = os.path.join( - PATHS["data_phase"], "flickr_fetched", "final.csv" - ) - record_txt_path = os.path.join( - PATHS["data_phase"], "flickr_fetched", "rec.txt" - ) - hs_csv_path = os.path.join(PATHS["data_phase"], "flickr_fetched", "hs.csv") - - # Ensure files exist - if not os.path.exists(record_txt_path): - with open(record_txt_path, "w") as f: - f.write("1 1 1") # Start from page 1, license 1, total pages 1 - - if not os.path.exists(final_csv_path): - with open(final_csv_path, "w") as f: - pass # Create an empty final.csv - - if not os.path.exists(hs_csv_path): - with open(hs_csv_path, "w") as f: - pass # Create an empty hs.csv - - flickr = flickrapi.FlickrAPI( - os.getenv("FLICKR_API_KEY"), - os.getenv("FLICKR_API_SECRET"), - format="json", - ) - license_list = [1, 2, 3, 4, 5, 6, 9, 10] - name_list = [ - "id", - "dateuploaded", - "isfavorite", - "license", - "realname", - "location", - "title", - "description", - "dates", - "views", - "comments", - "tags", - ] - temp_list = creat_lisoflis(len(name_list)) - - # Dictionary to store photo data for each Creative Commons license - photo_data_dict = {license_num: [] for license_num in license_list} - - with open(record_txt_path) as f: - readed = f.read().split(" ") - j = int(readed[0]) - i = int(readed[1]) - total = int(readed[2]) - - while i in license_list: - while j <= total: - try: - photosJson = flickr.photos.search( - license=i, per_page=100, page=j - ) - handle_rate_limiting() - photos = json.loads(photosJson.decode("utf-8")) - id_list = [x["id"] for x in photos["photos"]["photo"]] - - if j == 1: - total = page1_reset(final_csv_path, photos) - - for index in range(len(id_list)): - detailJson = flickr.photos.getInfo( - license=i, photo_id=id_list[index] - ) - handle_rate_limiting() - photos_detail = json.loads(detailJson.decode("utf-8")) - LOGGER.info( - f"{index} id out of {len(id_list)} in " - f"license {i}, page {j} out of {total}" - ) - query_data(photos_detail, name_list, temp_list) - photo_data_dict[i].append(photos_detail) - - j += 1 - LOGGER.info( - f"Page {j} out of {total} in license " - f"{i} with retry number {RETRIES}" - ) - df_to_csv(temp_list, name_list, hs_csv_path, final_csv_path) - with open(record_txt_path, "w") as f: - f.write(f"{j} {i} {total}") - temp_list = creat_lisoflis(len(name_list)) - - if j > total: - license_i_path = os.path.join( - PATHS["data_phase"], - "flickr_fetched", - f"cleaned_license{i}.csv", - ) - clean_saveas_csv(final_csv_path, license_i_path) - i += 1 - j = 1 - while i not in license_list: - i += 1 - with open(record_txt_path, "w") as f: - f.write(f"{j} {i} {total}") - temp_list = creat_lisoflis(len(name_list)) - break - - except flickrapi.exceptions.FlickrError as e: - if "rate limit" in str(e).lower(): - LOGGER.warning("Rate limit reached, sleeping for an hour.") - time.sleep(SECONDS_PER_HOUR) - continue - else: - LOGGER.error(f"Flickr API error: {e}") - raise - - # Save the dictionary containing photo data to a JSON file - with 
open( - os.path.join(PATHS["data_phase"], "flickr_fetched", "photos.json"), "w" - ) as json_file: - json.dump(photo_data_dict, json_file) - - -def save_license_totals(): - LOGGER.info("Saving license totals.") - license_counts = {} - for i in [1, 2, 3, 4, 5, 6, 9, 10]: - df = pd.read_csv( - os.path.join( - PATHS["data_phase"], - "flickr_fetched", - f"cleaned_license{i}.csv", - ) - ) - license_counts[i] = len(df) - - license_total_path = os.path.join( - PATHS["data_phase"], "flickr_fetched", "license_total.csv" - ) - with open(license_total_path, "w") as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(["License", "Total"]) - for license, total in license_counts.items(): - writer.writerow([license, total]) - - -def main(): - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - process_data() - save_license_totals() - LOGGER.info("Script execution completed successfully.") - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit new reports" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/internetarchive_fetch.py b/scripts/1-fetch/internetarchive_fetch.py deleted file mode 100755 index 92b3e591..00000000 --- a/scripts/1-fetch/internetarchive_fetch.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from the Internet Archive API. -""" - -# Standard library -import argparse -import csv -import os -import sys -import traceback - -# Third-party -import pandas as pd -import yaml -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# First-party/Local -from internetarchive.search import Search -from internetarchive.session import ArchiveSession - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser( - description="Internet Archive Data Fetching Script" - ) - parser.add_argument( - "--licenses", type=int, default=10, help="Number of licenses to query" - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = "LICENSE TYPE,Document Count\n" - with open( - os.path.join(PATHS["data_phase"], "internetarchive_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_license_list(): - """ - Provides the list of licenses from a Creative Commons provided tool list. - - Returns: - list: A list containing all license types that - should be searched from Internet Archive. 
- """ - LOGGER.info("Retrieving list of licenses from Creative Commons' record.") - cc_license_data = pd.read_csv( - os.path.join(PATHS["repo"], "legal-tool-paths.txt"), header=None - ) - license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*" - license_list = ( - cc_license_data[0] - .str.extract(license_pattern, expand=False) - .dropna() - .unique() - ) - return license_list - - -def get_response_elems(license_type): - """ - Retrieves the number of documents for the - specified license type from Internet Archive. - - Args: - license_type: A string representing the type of license. - - Returns: - dict: A dictionary containing the total document count. - """ - LOGGER.info(f"Querying metadata for license: {license_type}") - try: - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 429, 500, 502, 503, 504], - ) - search_session = ArchiveSession() - search_session.mount_http_adapter( - protocol="https://", - max_retries=HTTPAdapter(max_retries=max_retries), - ) - search_data = Search( - search_session, - f'/metadata/licenseurl:("http://creativecommons.org/' - f'{license_type}")', - ) - return {"totalResults": len(search_data)} - except Exception as e: - LOGGER.error(f"Error fetching data for license: {license_type}: {e}") - raise shared.QuantifyingException(f"Error fetching data: {e}", 1) - - -def retrieve_license_data(args): - """ - Retrieves the data of all license types specified. - - Args: - args: Parsed command-line arguments. - - Returns: - int: The total number of documents retrieved. - """ - LOGGER.info("Retrieving the data for all license types.") - licenses = get_license_list()[: args.licenses] - - # data = [] - total_docs_retrieved = 0 - - for license_type in licenses: - data_dict = get_response_elems(license_type) - total_docs_retrieved += int(data_dict["totalResults"]) - record_results(license_type, data_dict) - - return total_docs_retrieved - - -def record_results(license_type, data): - """ - Records the data for a specific license type into the CSV file. - - Args: - license_type: The license type. - data: A dictionary containing the data to record. - """ - LOGGER.info(f"Recording data for license: {license_type}") - row = [license_type, data["totalResults"]] - with open( - os.path.join(PATHS["data_phase"], "internetarchive_fetched.csv"), - "a", - newline="", - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - - Returns: - dict: The last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (internet archive)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - state = load_state() - total_docs_retrieved = state["total_records_retrieved (internet archive)"] - LOGGER.info(f"Initial total_records_retrieved: {total_docs_retrieved}") - goal_documents = 1000 # Set goal number of documents - - if total_docs_retrieved >= goal_documents: - LOGGER.info( - f"Goal of {goal_documents} documents already achieved." - " No further action required." 
- ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_docs_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - docs_retrieved = retrieve_license_data(args) - - # Update the state with the new count of retrieved records - total_docs_retrieved += docs_retrieved - LOGGER.info( - f"Total documents retrieved after fetching: {total_docs_retrieved}" - ) - state["total_records_retrieved (internet archive)"] = total_docs_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit Internet Archive data", - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/metmuseum_fetch.py b/scripts/1-fetch/metmuseum_fetch.py deleted file mode 100755 index 6c378c35..00000000 --- a/scripts/1-fetch/metmuseum_fetch.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from the MetMuseum API. -""" - -# Standard library -import argparse -import csv -import os -import sys -import traceback - -# Third-party -import requests -import yaml -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser( - description="MetMuseum Data Fetching Script" - ) - parser.add_argument( - "--licenses", type=int, default=1, help="Number of licenses to query" - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = "LICENSE TYPE,Document Count\n" - with open( - os.path.join(PATHS["data_phase"], "metmuseum_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_request_url(): - """ - Provides the API Endpoint URL for MetMuseum data. - - Returns: - string: The API Endpoint URL for the query. - """ - LOGGER.info("Providing the API Endpoint URL for MetMuseum data.") - return "https://collectionapi.metmuseum.org/public/collection/v1/objects" - - -def get_response_elems(): - """ - Retrieves the total number of documents from the MetMuseum API. - - Returns: - dict: A dictionary containing the total document count. 
- """ - LOGGER.info("Querying metadata from the MetMuseum API.") - try: - request_url = get_request_url() - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 429, 500, 502, 503, 504], - ) - session = requests.Session() - session.mount("https://", HTTPAdapter(max_retries=max_retries)) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - return {"totalResults": search_data.get("total", 0)} - except Exception as e: - LOGGER.error(f"Error occurred during request: {e}") - raise shared.QuantifyingException(f"Error fetching data: {e}", 1) - - -def retrieve_license_data(): - """ - Retrieves the data for the public domain license from the MetMuseum API. - - Returns: - int: The total number of documents retrieved. - """ - LOGGER.info( - "Retrieving the data for public domain license from MetMuseum." - ) - data_dict = get_response_elems() - total_docs_retrieved = int(data_dict["totalResults"]) - record_results("publicdomain/zero/1.0", data_dict) - return total_docs_retrieved - - -def record_results(license_type, data): - """ - Records the data for a specific license type into the CSV file. - - Args: - license_type: The license type. - data: A dictionary containing the data to record. - """ - LOGGER.info(f"Recording data for license: {license_type}") - row = [license_type, data["totalResults"]] - with open( - os.path.join(PATHS["data_phase"], "metmuseum_fetched.csv"), - "a", - newline="", - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - - Returns: - dict: The last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (metmuseum)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - # args = parse_arguments() - - state = load_state() - total_docs_retrieved = state["total_records_retrieved (metmusuem)"] - LOGGER.info(f"Initial total_documents_retrieved: {total_docs_retrieved}") - goal_documents = 1000 # Set goal number of documents - - if total_docs_retrieved >= goal_documents: - LOGGER.info( - f"Goal of {goal_documents} documents already achieved." - " No further action required." 
- ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_docs_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - docs_retrieved = retrieve_license_data() - - # Update the state with the new count of retrieved records - total_docs_retrieved += docs_retrieved - LOGGER.info( - f"Total documents retrieved after fetching: {total_docs_retrieved}" - ) - state["total_records_retrieved (metmuseum)"] = total_docs_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit MetMuseum data" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/vimeo_fetch.py b/scripts/1-fetch/vimeo_fetch.py deleted file mode 100755 index d307e62a..00000000 --- a/scripts/1-fetch/vimeo_fetch.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from the Vimeo API. -""" - -# Standard library -import argparse -import csv -import os -import sys -import traceback - -# Third-party -import requests -import yaml -from dotenv import load_dotenv -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Load environment variables -load_dotenv(PATHS["dotenv"]) - -# Gets API_KEYS and PSE_KEY from .env file -ACCESS_TOKEN = os.getenv("VIMEO_ACCESS_TOKEN") -CLIENT_ID = os.getenv("VIMEO_CLIENT_ID") - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser(description="Vimeo Data Fetching Script") - parser.add_argument( - "--licenses", type=int, default=8, help="Number of licenses to query" - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = "LICENSE TYPE,Document Count\n" - with open( - os.path.join(PATHS["data_phase"], "vimeo_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_license_list(): - """ - Provides the list of licenses to be searched via Vimeo API. - Returns: - List: A list containing all license types to be searched in Vimeo API. - """ - LOGGER.info("Providing the list of licenses to be searched in Vimeo API.") - return [ - "CC", - "CC-BY", - "CC-BY-NC", - "CC-BY-NC-ND", - "CC-BY-NC-SA", - "CC-BY-ND", - "CC-BY-SA", - "CC0", - ] - - -def get_request_url(license="CC"): - """ - Provides the API Endpoint URL for specified license combinations. - - Args: - license: A string representing the type of license. 
- - Returns: - string: A string representing the API Endpoint URL for the query. - """ - LOGGER.info( - "Providing the API Endpoint URL for specified parameter combinations." - ) - return ( - f"https://api.vimeo.com/videos?filter={license}" - f"&client_id={CLIENT_ID}&access_token={ACCESS_TOKEN}" - ) - - -def get_response_elems(license): - """ - Provides the metadata for query of specified license type. - - Args: - license: A string representing the type of license. - - Returns: - dict: A dictionary containing the metadata from the API query. - """ - LOGGER.info(f"Querying metadata for license: {license}") - try: - request_url = get_request_url(license=license) - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 429, 500, 502, 503, 504], - ) - session = requests.Session() - session.mount("https://", HTTPAdapter(max_retries=max_retries)) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - return {"totalResults": search_data["total"]} - except Exception as e: - LOGGER.error(f"Error occurred during request: {e}") - raise shared.QuantifyingException(f"Error fetching data: {e}", 1) - - -def retrieve_license_data(args): - """ - Retrieves the data of all license types specified. - - Returns: - int: The total number of documents retrieved. - """ - LOGGER.info("Retrieving the data for all license types from Vimeo.") - licenses = get_license_list()[: args.licenses] - - data = [] - total_docs_retrieved = 0 - - for license_type in licenses: - data_dict = get_response_elems(license_type) - total_docs_retrieved += data_dict["totalResults"] - record_results(license_type, data_dict) - - for row in data: - LOGGER.info(f"Collected data row: {row}") - - return data - - -def record_results(license_type, data): - """ - Records the data for a specific license type into the CSV file. - - Args: - license_type: The license type. - data: A dictionary containing the data to record. - """ - LOGGER.info(f"Recording data for license: {license_type}") - row = [license_type, data["totalResults"]] - with open( - os.path.join(PATHS["data_phase"], "vimeo_fetched.csv"), "a", newline="" - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - - Returns: - dict: The last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (vimeo)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - state = load_state() - total_docs_retrieved = state["total_records_retrieved (vimeo)"] - LOGGER.info(f"Initial total_documents_retrieved: {total_docs_retrieved}") - goal_documents = 1000 # Set goal number of documents - - if total_docs_retrieved >= goal_documents: - LOGGER.info( - f"Goal of {goal_documents} documents already achieved." - " No further action required." 
- ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_docs_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - docs_retrieved = retrieve_license_data(args) - - # Update the state with the new count of retrieved records - total_docs_retrieved += docs_retrieved - LOGGER.info( - f"Total documents retrieved after fetching: {total_docs_retrieved}" - ) - state["total_records_retrieved (vimeo)"] = total_docs_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit Vimeo data" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py deleted file mode 100755 index ed5ea026..00000000 --- a/scripts/1-fetch/wikicommons_fetch.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from WikiCommons. -""" - -# Standard library -import argparse -import csv -import os -import sys -import traceback - -# Third-party -import requests -import yaml -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser( - description="WikiCommons Data Fetching Script" - ) - parser.add_argument( - "--license_alias", - type=str, - default="Free_Creative_Commons_licenses", - help="Root category for recursive license search", - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = "LICENSE TYPE,File Count,Page Count\n" - with open( - os.path.join(PATHS["data_phase"], "wikicommons_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_content_request_url(license): - """ - Provides the API Endpoint URL for - specified parameters' WikiCommons contents. - - Args: - license: A string representing the type of license. - - Returns: - string: The API Endpoint URL for the - query specified by this function's parameters. - """ - LOGGER.info(f"Generating content request URL for license: {license}") - return ( - r"https://commons.wikimedia.org/w/api.php?" - r"action=query&prop=categoryinfo&titles=" - f"Category:{license}&format=json" - ) - - -def get_subcat_request_url(license): - """ - Provides the API Endpoint URL for specified parameters' - WikiCommons subcategories for recursive searching. 
- - Args: - license: A string representing the type of license. - - Returns: - string: The API Endpoint URL for the query - specified by this function's parameters. - """ - LOGGER.info(f"Generating subcategory request URL for license: {license}") - base_url = ( - r"https://commons.wikimedia.org/w/api.php?" - r"action=query&cmtitle=" - f"Category:{license}" - r"&cmtype=subcat&list=categorymembers&format=json" - ) - return base_url - - -def get_subcategories(license, session): - """ - Obtain the subcategories of LICENSE in - WikiCommons Database for recursive searching. - - Args: - license: A string representing the type of license. - session: A requests.Session object for accessing API endpoints. - - Returns: - list: A list representing the subcategories - of current license type in WikiCommons dataset. - """ - LOGGER.info(f"Obtaining subcategories for license: {license}") - try: - request_url = get_subcat_request_url(license) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - category_list = [ - members["title"].replace("Category:", "").replace("&", "%26") - for members in search_data["query"]["categorymembers"] - ] - return category_list - except Exception as e: - LOGGER.error(f"Error occurred during subcategory request: {e}") - raise shared.QuantifyingException( - f"Error fetching subcategories: {e}", 1 - ) - - -def get_license_contents(license, session): - """ - Provides the metadata for a query of specified parameters. - - Args: - license: A string representing the type of license. - session: A requests.Session object for accessing API endpoints. - - Returns: - dict: A dictionary mapping metadata - to its value provided from the API query. - """ - LOGGER.info(f"Querying content for license: {license}") - try: - request_url = get_content_request_url(license) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - file_cnt = 0 - page_cnt = 0 - for id in search_data["query"]["pages"]: - lic_content = search_data["query"]["pages"][id] - file_cnt += lic_content["categoryinfo"]["files"] - page_cnt += lic_content["categoryinfo"]["pages"] - return {"total_file_cnt": file_cnt, "total_page_cnt": page_cnt} - except Exception as e: - LOGGER.error(f"Error occurred during content request: {e}") - raise shared.QuantifyingException(f"Error fetching content: {e}", 1) - - -def record_results(license_type, data): - """ - Records the data for a specific license type into the CSV file. - - Args: - license_type: The license type. - data: A dictionary containing the data to record. - """ - LOGGER.info(f"Recording data for license: {license_type}") - row = [license_type, data["total_file_cnt"], data["total_page_cnt"]] - with open( - os.path.join(PATHS["data_phase"], "wikicommons_fetched.csv"), - "a", - newline="", - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def recur_record_all_licenses(license_alias="Free_Creative_Commons_licenses"): - """ - Recursively records the data of all license - types findable in the license list and its individual subcategories. - - Args: - license_alias: The root category alias for recursive search. 
- """ - LOGGER.info("Starting recursive recording of license data.") - - license_cache = {} - session = requests.Session() - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 429, 500, 502, 503, 504], - ) - session.mount("https://", HTTPAdapter(max_retries=max_retries)) - - def recursive_traversing_subroutine(alias): - alias.replace(",", "|") - cur_category = alias.split("/")[-1] - subcategories = get_subcategories(cur_category, session) - if cur_category not in license_cache: - license_content = get_license_contents(cur_category, session) - record_results(alias, license_content) - license_cache[cur_category] = True - for cats in subcategories: - recursive_traversing_subroutine(f"{alias}/{cats}") - - recursive_traversing_subroutine(license_alias) - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (wikicommons)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - state = load_state() - total_docs_retrieved = state["total_records_retrieved (wikicommons)"] - LOGGER.info(f"Initial total_documents_retrieved: {total_docs_retrieved}") - goal_documents = 1000 # Set goal number of documents - - if total_docs_retrieved >= goal_documents: - LOGGER.info( - f"Goal of {goal_documents} documents already achieved." - " No further action required." - ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_docs_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - recur_record_all_licenses(args.license_alias) - - # Update the state with the new count of retrieved records - total_docs_retrieved += 1 # Update with actual number retrieved - LOGGER.info( - f"Total documents retrieved after fetching: {total_docs_retrieved}" - ) - state["total_records_retrieved (wikicommons)"] = total_docs_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit WikiCommons data" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py deleted file mode 100755 index 57a04663..00000000 --- a/scripts/1-fetch/wikipedia_fetch.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from the Wikipedia API. 
-""" - -# Standard library -import argparse -import csv -import os -import sys -import traceback - -# Third-party -import requests -import yaml -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser( - description="Wikipedia Data Fetching Script" - ) - parser.add_argument( - "--languages", - type=str, - nargs="+", - default=["en"], - help="List of Wikipedia language codes to query", - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = ( - "language,articles,edits,images," - "users,activeusers,admins,jobqueue,views\n" - ) - with open( - os.path.join(PATHS["data_phase"], "wikipedia_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_request_url(lang="en"): - """ - Provides the API Endpoint URL for specified parameter combinations. - - Args: - lang: A string representing the language for the Wikipedia API. - - Returns: - string: The API Endpoint URL for the query. - """ - LOGGER.info(f"Generating request URL for language: {lang}") - base_url = ( - r"https://{lang}.wikipedia.org/w/api.php" - "?action=query&meta=siteinfo" - "&siprop=statistics&format=json" - ) - return base_url.format(lang=lang) - - -def get_response_elems(language="en"): - """ - Provides the metadata for query of specified parameters. - - Args: - language: A string representing the language for the Wikipedia API. - - Returns: - dict: A dictionary mapping metadata - to its value provided from the API query. - """ - LOGGER.info(f"Querying Wikipedia API for language: {language}") - try: - request_url = get_request_url(language) - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 429, 500, 502, 503, 504], - ) - session = requests.Session() - session.mount("https://", HTTPAdapter(max_retries=max_retries)) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - stats = search_data.get("query", {}).get("statistics", {}) - stats["language"] = language - return stats - except Exception as e: - LOGGER.error(f"Error occurred during API request: {e}") - raise shared.QuantifyingException(f"Error fetching data: {e}", 1) - - -def record_results(stats): - """ - Records the data for a specific language into the CSV file. - - Args: - stats: A dictionary of Wikipedia statistics. - """ - LOGGER.info(f"Recording data for language: {stats.get('language')}") - row = [ - stats.get("language", ""), - stats.get("articles", 0), - stats.get("edits", 0), - stats.get("images", 0), - stats.get("users", 0), - stats.get("activeusers", 0), - stats.get("admins", 0), - stats.get("jobqueue", 0), - stats.get("views", 0), - ] - with open( - os.path.join(PATHS["data_phase"], "wikipedia_fetched.csv"), - "a", - newline="", - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def retrieve_and_record_data(args): - """ - Retrieves and records the data for all specified languages. 
- """ - LOGGER.info("Starting data retrieval and recording.") - total_records_retrieved = 0 - - for lang in args.languages: - stats = get_response_elems(lang) - if stats: - record_results(stats) - total_records_retrieved += 1 - - return total_records_retrieved - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (wikipedia)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - state = load_state() - total_records_retrieved = state["total_records_retrieved (wikipedia)"] - LOGGER.info(f"Initial total_records_retrieved: {total_records_retrieved}") - goal_records = 1000 # Set goal number of records - - if total_records_retrieved >= goal_records: - LOGGER.info( - f"Goal of {goal_records} records already achieved." - " No further action required." - ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_records_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - records_retrieved = retrieve_and_record_data(args) - - # Update the state with the new count of retrieved records - total_records_retrieved += records_retrieved - LOGGER.info( - f"Total records retrieved after fetching: {total_records_retrieved}" - ) - state["total_records_retrieved (wikipedia)"] = total_records_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit Wikipedia data" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/1-fetch/youtube_fetch.py b/scripts/1-fetch/youtube_fetch.py deleted file mode 100755 index 7689ab9a..00000000 --- a/scripts/1-fetch/youtube_fetch.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to querying data from the YouTube API. -""" - -# Standard library -import argparse -import csv -import datetime -import os -import sys -import traceback - -# Third-party -import requests -import yaml -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Load environment variables -API_KEY = os.getenv("YOUTUBE_API_KEY") - -# Log the start of the script execution -LOGGER.info("Script execution started.") - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. 
- """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser( - description="YouTube Data Fetching Script" - ) - parser.add_argument( - "--license_type", - type=str, - default="licenses/by/3.0", - help="License type to query", - ) - return parser.parse_args() - - -def set_up_data_file(): - """ - Sets up the data file for recording results. - """ - LOGGER.info("Setting up the data file for recording results.") - header = "LICENSE TYPE,Time,Document Count\n" - with open( - os.path.join(PATHS["data_phase"], "youtube_fetched.csv"), "w" - ) as f: - f.write(header) - - -def get_next_time_search_interval(): - """ - Provides the next searching interval of time - for Creative Commons licensed video. - - Yields: - tuple: A tuple representing the time search interval currently - dealt via 2 RFC 3339 formatted date-time values (by YouTube - API Standards), and current starting year/month of the interval. - """ - LOGGER.info("Generating time intervals for search.") - datetime_today = datetime.datetime.today() - cur_year, cur_month = 2009, 1 - while ( - cur_year * 100 + cur_month - <= datetime_today.year * 100 + datetime_today.month - ): - end_month, end_day = 12, 31 - if cur_month == 1: - end_month, end_day = 2, 28 + int(cur_year % 4 == 0) - elif cur_month == 3: - end_month, end_day = 4, 30 - elif cur_month == 5: - end_month, end_day = 6, 30 - elif cur_month == 7: - end_month, end_day = 8, 31 - elif cur_month == 9: - end_month, end_day = 10, 31 - elif cur_month == 11: - end_month, end_day = 12, 31 - yield ( - f"{cur_year}-{cur_month:02d}-01T00:00:00Z", - f"{cur_year}-{end_month:02d}-{end_day:02d}T23:59:59Z", - cur_year, - cur_month, - ) - cur_month += 2 - if cur_month > 12: - cur_month = 1 - cur_year += 1 - - -def get_request_url(time=None): - """ - Provides the API Endpoint URL for specified parameter combinations. - - Args: - time: A tuple indicating the time interval for the query. - - Returns: - string: The API Endpoint URL for the query. - """ - LOGGER.info("Generating request URL for time interval.") - base_url = ( - r"https://youtube.googleapis.com/youtube/v3/search?" - "part=snippet&type=video&videoLicense=creativeCommon" - ) - if time is not None: - base_url += f"&publishedAfter={time[0]}&publishedBefore={time[1]}" - return f"{base_url}&key={API_KEY}" - - -def get_response_elems(time=None): - """ - Provides the metadata for query of specified parameters. - - Args: - time: A tuple indicating the time interval for the query. - - Returns: - dict: A dictionary mapping metadata to - its value provided from the API query. - """ - LOGGER.info(f"Querying YouTube API for time interval: {time[2]}-{time[3]}") - try: - request_url = get_request_url(time=time) - max_retries = Retry( - total=5, - backoff_factor=10, - status_forcelist=[403, 408, 429, 500, 502, 503, 504], - ) - session = requests.Session() - session.mount("https://", HTTPAdapter(max_retries=max_retries)) - with session.get(request_url) as response: - response.raise_for_status() - search_data = response.json() - return search_data.get("pageInfo", {}).get("totalResults", 0) - except Exception as e: - LOGGER.error(f"Error occurred during API request: {e}") - raise shared.QuantifyingException(f"Error fetching data: {e}", 1) - - -def record_results(license_type, time, document_count): - """ - Records the data for a specific license type - and time interval into the CSV file. - - Args: - license_type: The license type. - time: The time interval. - document_count: The number of documents. 
- """ - LOGGER.info( - f"Recording data for license: {license_type}," - "time: {time}, count: {document_count}" - ) - row = [license_type, time, document_count] - with open( - os.path.join(PATHS["data_phase"], "youtube_fetched.csv"), - "a", - newline="", - ) as f: - writer = csv.writer(f, dialect="unix") - writer.writerow(row) - - -def retrieve_and_record_data(args): - """ - Retrieves and records the data for all license types and time intervals. - """ - LOGGER.info("Starting data retrieval and recording.") - total_documents_retrieved = 0 - - for time in get_next_time_search_interval(): - document_count = get_response_elems(time=time) - record_results( - args.license_type, f"{time[2]}-{time[3]}", document_count - ) - total_documents_retrieved += document_count - - return total_documents_retrieved - - -def load_state(): - """ - Loads the state from a YAML file, returns the last recorded state. - """ - if os.path.exists(PATHS["state"]): - with open(PATHS["state"], "r") as f: - return yaml.safe_load(f) - return {"total_records_retrieved (youtube)": 0} - - -def save_state(state: dict): - """ - Saves the state to a YAML file. - - Args: - state: The state dictionary to save. - """ - with open(PATHS["state"], "w") as f: - yaml.safe_dump(state, f) - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - state = load_state() - total_docs_retrieved = state["total_records_retrieved (youtube)"] - LOGGER.info(f"Initial total_documents_retrieved: {total_docs_retrieved}") - goal_documents = 1000 # Set goal number of documents - - if total_docs_retrieved >= goal_documents: - LOGGER.info( - f"Goal of {goal_documents} documents already achieved." - " No further action required." - ) - return - - # Log the paths being used - shared.paths_log(LOGGER, PATHS) - - # Create data directory for this phase - os.makedirs(PATHS["data_phase"], exist_ok=True) - - if total_docs_retrieved == 0: - set_up_data_file() - - # Retrieve and record data - docs_retrieved = retrieve_and_record_data(args) - - # Update the state with the new count of retrieved records - total_docs_retrieved += docs_retrieved - LOGGER.info( - f"Total documents retrieved after fetching: {total_docs_retrieved}" - ) - state["total_records_retrieved (youtube)"] = total_docs_retrieved - save_state(state) - - # Add and commit changes - shared.add_and_commit( - PATHS["repo"], PATHS["data_quarter"], "Add and commit YouTube data" - ) - - # Push changes - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/deviantart_process.py b/scripts/2-process/deviantart_process.py deleted file mode 100755 index b9e01544..00000000 --- a/scripts/2-process/deviantart_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Deviantart data -for analysis and comparison between quarters. 
-""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "deviantart_fetched", "license_total.csv") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. -# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/flickr_process.py b/scripts/2-process/flickr_process.py deleted file mode 100755 index a8834937..00000000 --- a/scripts/2-process/flickr_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Flickr data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "flickr_fetched", "license_total.csv") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. 
-# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. -# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/internetarchive_process.py b/scripts/2-process/internetarchive_process.py deleted file mode 100755 index 10a3e138..00000000 --- a/scripts/2-process/internetarchive_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Internet Archive data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "internetarchive_fetched") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/metmuseum_process.py b/scripts/2-process/metmuseum_process.py deleted file mode 100755 index 51134fce..00000000 --- a/scripts/2-process/metmuseum_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Met Museum data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "metmuseum_fetched") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return Nones -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/vimeo_process.py b/scripts/2-process/vimeo_process.py deleted file mode 100755 index 867fbbbe..00000000 --- a/scripts/2-process/vimeo_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Vimeo data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "vimeo_fetched",) -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/wikicommons_process.py b/scripts/2-process/wikicommons_process.py deleted file mode 100755 index 890c1b96..00000000 --- a/scripts/2-process/wikicommons_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Wikicommons data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "wikicommons_fetched") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py deleted file mode 100755 index d5ea354a..00000000 --- a/scripts/2-process/wikipedia_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Wikipedia data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "wikipedia_fetched") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/2-process/youtube_process.py b/scripts/2-process/youtube_process.py deleted file mode 100755 index a04520ca..00000000 --- a/scripts/2-process/youtube_process.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Youtube data -for analysis and comparison between quarters. -""" -# Standard library -import os -import sys -import traceback - -# import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "youtube_fetched") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. 
-# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - -def main(): - raise shared.QuantifyingException("No current code for Phase 2", 0) - - # # Fetch and merge changes - # shared.fetch_and_merge(PATHS["repo"]) - - # # Add and commit changes - # shared.add_and_commit( - # PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data" - # ) - - # # Push changes - # shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/deviantart_report.py b/scripts/3-report/deviantart_report.py deleted file mode 100755 index cdcc2389..00000000 --- a/scripts/3-report/deviantart_report.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Deviantart. -""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Deviantart Reports") - parser.add_argument( - "--quarter", - "-q", - type=str, - required=False, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. 
- """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], - f"{selected_quarter}", - "1-fetch", - "deviantart_fetched.csv", - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -def visualize_by_license_type(data, args): - """ - Create a bar chart for the number of repositories licensed by license type. - """ - LOGGER.info( - "Creating a bar chart for the number of documents by license type." - ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=data["LICENSE TYPE"], y=data["Document Count"]) - plt.title("Number of DeviantArt Documents by License Type") - plt.xlabel("License Type") - plt.ylabel("Document Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join( - output_directory, "deviantart_license_report.png" - ) - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "DeviantArt", - "Number of DeviantArt Documents by License Type", - "License Type Report", - args, - ) - - LOGGER.info("Visualization by license type created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_license_type(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new Deviantart reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/flickr_report.py b/scripts/3-report/flickr_report.py deleted file mode 100755 index ddbe25fa..00000000 --- a/scripts/3-report/flickr_report.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing the data collected for Flickr. 
-""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -# import matplotlib.pyplot as plt -# import matplotlib.ticker as ticker -import pandas as pd - -# import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Flickr Report") - parser.add_argument( - "--quarter", - "-q", - type=str, - required=False, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. - """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], - f"{selected_quarter}", - "1-fetch", - "flickr_fetched", - "final.csv", - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -# Add functions for individual license graphs + word clouds + total license - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - """ - Insert functions for Flickr - """ - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new GitHub reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/internetarchive_report.py b/scripts/3-report/internetarchive_report.py deleted file mode 100755 index 98f1266a..00000000 --- a/scripts/3-report/internetarchive_report.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Internet Archive. 
-""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Internet Archive Reports") - parser.add_argument( - "--quarter", - "-q", - type=str, - required=False, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. - """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], - f"{selected_quarter}", - "1-fetch", - "internetarchive_fetched.csv", - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -def visualize_by_license_type(data, args): - """ - Create a bar chart for the number of repositories licensed by license type. - """ - LOGGER.info( - "Creating a bar chart for the number of documents by license type." 
- ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=data["LICENSE TYPE"], y=data["Document Count"]) - plt.title("Number of Internet Archive Documents by License Type") - plt.xlabel("License Type") - plt.ylabel("Document Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join( - output_directory, "internetarchive_license_report.png" - ) - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "Internet Archive", - "Number of Internet Archive Documents by License Type", - "License Type Report", - args, - ) - LOGGER.info("Visualization by license type created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_license_type(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new Internet Archive reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/metmuseum_report.py b/scripts/3-report/metmuseum_report.py deleted file mode 100755 index e308a6a6..00000000 --- a/scripts/3-report/metmuseum_report.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Metmuseum. -""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. 
- """ - LOGGER.info("Parsing command-line arguments") - - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Met Museum Reports") - parser.add_argument( - "--quarter", - "-q", - type=str, - required=False, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. - """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], - f"{selected_quarter}", - "1-fetch", - "metmuseum_fetched.csv", - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -def visualize_by_license_type(data, args): - """ - Create a bar chart for the number of repositories licensed by license type. - """ - LOGGER.info( - "Creating a bar chart for the number of documents by license type." - ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=data["LICENSE TYPE"], y=data["Document Count"]) - plt.title("Number of MetMuseum Documents by License Type") - plt.xlabel("License Type") - plt.ylabel("Document Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "metmuseum_license_report.png") - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "MetMuseum", - "Number of MetMuseum Documents by License Type", - "License Type Report", - args, - ) - LOGGER.info("Visualization by license type created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_license_type(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new Metmuseum reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - 
LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/vimeo_report.py b/scripts/3-report/vimeo_report.py deleted file mode 100755 index c7dc6875..00000000 --- a/scripts/3-report/vimeo_report.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Vimeo. -""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. - """ - LOGGER.info("Parsing command-line arguments") - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Vimeo Data Report") - parser.add_argument( - "--quarter", - "-q", - type=str, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. - """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], f"{selected_quarter}", "1-fetch", "vimeo_fetched.csv" - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -def visualize_by_license_type(data, args): - """ - Create a bar chart for the number of documents by license type. - """ - LOGGER.info( - "Creating a bar chart for the number of documents by license type." 
- ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=data["LICENSE TYPE"], y=data["Document Count"]) - plt.title("Number of Vimeo Documents by License Type") - plt.xlabel("License Type") - plt.ylabel("Document Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "vimeo_license_report.png") - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "Vimeo", - "Number of Vimeo Documents by License Type", - "License Type Report", - args, - ) - LOGGER.info("Visualization by license type created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_license_type(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new Vimeo reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/wikicommons_report.py b/scripts/3-report/wikicommons_report.py deleted file mode 100755 index e5ffba3b..00000000 --- a/scripts/3-report/wikicommons_report.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Wikicommons. -""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. 
- """ - LOGGER.info("Parsing command-line arguments") - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Wikicommons Data Report") - parser.add_argument( - "--quarter", - "-q", - type=str, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. - """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], - f"{selected_quarter}", - "1-fetch", - "wikicommons_fetched.csv", - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -def visualize_by_file_count(data, args): - """ - Create a bar chart for the number of files by license type. - """ - LOGGER.info( - "Creating a bar chart for the number of files by license type." - ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=data["LICENSE TYPE"], y=data["File Count"]) - plt.title("Number of WikiCommons Files by License Type") - plt.xlabel("License Type") - plt.ylabel("File Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "wikicommons_file_report.png") - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "WikiCommons", - "Number of WikiCommons Files by License Type", - "File Count Report", - args, - ) - LOGGER.info("Visualization by file count created.") - - -def visualize_by_page_count(data, args): - """ - Create a bar chart for the number of pages by license type. - """ - LOGGER.info( - "Creating a bar chart for the number of pages by license type." 
- ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x=data["LICENSE TYPE"], y=data["Page Count"]) - plt.title("Number of WikiCommons Pages by License Type") - plt.xlabel("License Type") - plt.ylabel("Page Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join(output_directory, "wikicommons_page_report.png") - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "WikiCommons", - "Number of WikiCommons Pages by License Type", - "Page Count Report", - args, - ) - LOGGER.info("Visualization by page count created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_file_count(data, args) - visualize_by_page_count(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new WikiCommons reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/wikipedia_report.py deleted file mode 100755 index 56302473..00000000 --- a/scripts/3-report/wikipedia_report.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Wikipedia. -""" -# Standard library -import argparse -import os -import sys -import traceback -from datetime import datetime, timezone - -# Third-party -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -from pandas import PeriodIndex - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. 
- """ - LOGGER.info("Parsing command-line arguments") - # Taken from shared module, fix later - datetime_today = datetime.now(timezone.utc) - quarter = PeriodIndex([datetime_today.date()], freq="Q")[0] - - parser = argparse.ArgumentParser(description="Wikipedia Data Report") - parser.add_argument( - "--quarter", - "-q", - type=str, - default=f"{quarter}", - help="Data quarter in format YYYYQx, e.g., 2024Q2", - ) - parser.add_argument( - "--skip-commit", - action="store_true", - help="Don't git commit changes (also skips git push changes)", - ) - parser.add_argument( - "--skip-push", - action="store_true", - help="Don't git push changes", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (in addition to saving them)", - ) - args = parser.parse_args() - if args.skip_commit: - args.skip_push = True - return args - - -def load_data(args): - """ - Load the collected data from the CSV file. - """ - selected_quarter = args.quarter - - file_path = os.path.join( - PATHS["data"], - f"{selected_quarter}", - "1-fetch", - "wikipedia_fetched.csv", - ) - - if not os.path.exists(file_path): - LOGGER.error(f"Data file not found: {file_path}") - return pd.DataFrame() - - data = pd.read_csv(file_path) - LOGGER.info(f"Data loaded from {file_path}") - return data - - -def visualize_by_language(data, args): - """ - Create a bar chart for various statistics by language. - """ - LOGGER.info("Creating bar charts for various statistics by language.") - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - columns_to_plot = ["pages", "articles", "edits", "images", "users"] - for column in columns_to_plot: - plt.figure(figsize=(12, 8)) - ax = sns.barplot(x="language", y=column, data=data) - plt.title(f"Wikipedia {column.capitalize()} by Language") - plt.xlabel("Language") - plt.ylabel(column.capitalize()) - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join( - output_directory, f"wikipedia_{column}_report.png" - ) - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "Wikipedia", - f"Wikipedia {column.capitalize()} by Language", - f"{column.capitalize()} Report", - args, - ) - LOGGER.info(f"Visualization by {column} created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_language(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new Wikpedia reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except 
-        LOGGER.error(f"System exit with code: {e.code}")
-        sys.exit(e.code)
-    except KeyboardInterrupt:
-        LOGGER.info("(130) Halted via KeyboardInterrupt.")
-        sys.exit(130)
-    except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
-        sys.exit(1)
diff --git a/scripts/3-report/youtube_report.py b/scripts/3-report/youtube_report.py
deleted file mode 100755
index af73fed8..00000000
--- a/scripts/3-report/youtube_report.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-"""
-This file is dedicated to visualizing and analyzing the data collected
-from YouTube.
-"""
-# Standard library
-import argparse
-import os
-import sys
-import traceback
-from datetime import datetime, timezone
-
-# Third-party
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-from pandas import PeriodIndex
-
-# Add parent directory so shared can be imported
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-
-# First-party/Local
-import shared  # noqa: E402
-
-# Setup
-LOGGER, PATHS = shared.setup(__file__)
-
-
-def parse_arguments():
-    """
-    Parses command-line arguments, returns parsed arguments.
-    """
-    LOGGER.info("Parsing command-line arguments")
-    # Taken from shared module, fix later
-    datetime_today = datetime.now(timezone.utc)
-    quarter = PeriodIndex([datetime_today.date()], freq="Q")[0]
-
-    parser = argparse.ArgumentParser(description="YouTube Data Report")
-    parser.add_argument(
-        "--quarter",
-        "-q",
-        type=str,
-        default=f"{quarter}",
-        help="Data quarter in format YYYYQx, e.g., 2024Q2",
-    )
-    parser.add_argument(
-        "--skip-commit",
-        action="store_true",
-        help="Don't git commit changes (also skips git push changes)",
-    )
-    parser.add_argument(
-        "--skip-push",
-        action="store_true",
-        help="Don't git push changes",
-    )
-    parser.add_argument(
-        "--show-plots",
-        action="store_true",
-        help="Show generated plots (in addition to saving them)",
-    )
-    args = parser.parse_args()
-    if args.skip_commit:
-        args.skip_push = True
-    return args
-
-
-def load_data(args):
-    """
-    Load the collected data from the CSV file.
-    """
-    selected_quarter = args.quarter
-
-    file_path = os.path.join(
-        PATHS["data"], f"{selected_quarter}", "1-fetch", "youtube_fetched.csv"
-    )
-
-    if not os.path.exists(file_path):
-        LOGGER.error(f"Data file not found: {file_path}")
-        return pd.DataFrame()
-
-    data = pd.read_csv(file_path)
-    LOGGER.info(f"Data loaded from {file_path}")
-    return data
-
-
-def visualize_by_license_type_over_time(data, args):
-    """
-    Create a line chart for document count over time by license type.
-    """
-    LOGGER.info(
-        "Creating a line chart for document count over time by license type."
- ) - - selected_quarter = args.quarter - - # Strip any leading/trailing spaces from the columns - data.columns = data.columns.str.strip() - - plt.figure(figsize=(12, 8)) - ax = sns.lineplot( - x="Time", y="Document Count", hue="LICENSE TYPE", data=data - ) - plt.title("YouTube Document Count Over Time by License Type") - plt.xlabel("Time") - plt.ylabel("Document Count") - plt.xticks(rotation=45, ha="right") - - # Add value numbers to the top of each bar - for p in ax.patches: - ax.annotate( - format(p.get_height(), ",.0f"), - (p.get_x() + p.get_width() / 2.0, p.get_height()), - ha="center", - va="center", - xytext=(0, 9), - textcoords="offset points", - ) - - output_directory = os.path.join( - PATHS["data"], f"{selected_quarter}", "3-report" - ) - - LOGGER.info(f"Output directory: {output_directory}") - - os.makedirs(output_directory, exist_ok=True) - image_path = os.path.join( - output_directory, "youtube_license_over_time_report.png" - ) - plt.savefig(image_path) - - if args.show_plots: - plt.show() - - shared.update_readme( - PATHS, - image_path, - "YouTube", - "YouTube Document Count Over Time by License Type", - "License Over Time Report", - args, - ) - LOGGER.info("Visualization by license type over time created.") - - -def main(): - - # Fetch and merge changes - shared.fetch_and_merge(PATHS["repo"]) - - args = parse_arguments() - - data = load_data(args) - if data.empty: - return - - current_directory = os.getcwd() - LOGGER.info(f"Current working directory: {current_directory}") - - visualize_by_license_type_over_time(data, args) - - # Add and commit changes - if not args.skip_commit: - shared.add_and_commit( - PATHS["repo"], - PATHS["data_quarter"], - "Add and commit new YouTube reports", - ) - - # Push changes - if not args.skip_push: - shared.push_changes(PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1)
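Note on the removed report stubs: the deleted youtube_report.py reused the bar-chart annotation loop (`for p in ax.patches`) on a seaborn line plot, where `ax.patches` is empty, so no value labels were ever drawn. The sketch below is a hypothetical replacement, not part of the removed code; it assumes the same "Time", "Document Count", and "LICENSE TYPE" columns the stub read from its fetched CSV.

# Hypothetical sketch (assumed column names, not from the deleted script):
# label each plotted point directly, since line plots have no bar patches.
import pandas as pd
import seaborn as sns


def plot_document_count_over_time(data: pd.DataFrame):
    ax = sns.lineplot(
        x="Time", y="Document Count", hue="LICENSE TYPE", data=data
    )
    # Annotate every point of every plotted line with its value.
    for line in ax.lines:
        for x, y in zip(line.get_xdata(), line.get_ydata()):
            ax.annotate(
                format(y, ",.0f"),
                (x, y),
                ha="center",
                va="bottom",
                xytext=(0, 5),
                textcoords="offset points",
            )
    return ax

Reading the coordinates back from ax.lines sidesteps any category-to-position conversion seaborn applies to string-valued x data, mirroring how the bar-chart reports read positions from ax.patches.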