From fe29ac75fb8fa65b7cd01b661931368c82849d39 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Thu, 19 Feb 2026 11:39:20 +0100 Subject: [PATCH 1/2] Arxiv process and report --- scripts/2-process/arxiv_process.py | 531 +++++++++++++++++++ scripts/3-report/arxiv_report.py | 808 +++++++++++++++++++++++++++++ scripts/plot.py | 112 +++- 3 files changed, 1450 insertions(+), 1 deletion(-) create mode 100644 scripts/2-process/arxiv_process.py create mode 100644 scripts/3-report/arxiv_report.py diff --git a/scripts/2-process/arxiv_process.py b/scripts/2-process/arxiv_process.py new file mode 100644 index 00000000..58fc4569 --- /dev/null +++ b/scripts/2-process/arxiv_process.py @@ -0,0 +1,531 @@ +#!/usr/bin/env python +""" +This file is dedicated to processing Arxiv data +for analysis and comparison between quarters. +""" + +# Standard library +import argparse +import os +import sys +import traceback + +# Third-party +import pandas as pd + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join(PATHS["data_phase"], "arxiv_totals_by_license.csv"), + shared.path_join(PATHS["data_phase"], "arxiv_cc0_1_by_category.csv"), +] + + +def parse_arguments(): + """ + Parse command-line options, returns parsed argument namespace. 
+ """ + global QUARTER + LOGGER.info("Parsing command-line options") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global FILE_PATHS, PATHS + FILE_PATHS = shared.paths_list_update( + LOGGER, FILE_PATHS, QUARTER, args.quarter + ) + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + QUARTER = args.quarter + args.logger = LOGGER + args.paths = PATHS + return args + + +def process_totals_by_license(args, count_data): + """ + Processing count data: totals by license + """ + LOGGER.info(process_totals_by_license.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + count = int(row.COUNT) + + data[tool] = count + + data = pd.DataFrame(data.items(), columns=["License", "Count"]) + data.sort_values("License", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_totals_by_license.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_totals_by_author_bucket(args, count_data): + """ + Processing count data: totals by author_bucket + """ + LOGGER.info(process_totals_by_author_bucket.__doc__.strip()) + data = count_data.pivot_table( + index="AUTHOR_BUCKET", + columns="TOOL_IDENTIFIER", + values="COUNT", + 
fill_value=0, + ).reset_index() + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_totals_by_author_bucket.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_3_by_year(args, count_data): + """ + Processing count data: CC BY 3.0 by year + """ + LOGGER.info(process_cc_by_3_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + + if tool != "CC BY 3.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_3_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_4_by_year(args, count_data): + """ + Processing count data: CC BY 4.0 by year + """ + LOGGER.info(process_cc_by_4_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + + if tool != "CC BY 4.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_4_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_nc_nd_4_by_year(args, count_data): + """ + Processing count data: CC BY-NC-ND 4.0 by year + """ + LOGGER.info(process_cc_by_nc_nd_4_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + + if tool != "CC BY-NC-ND 4.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, 
inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_nd_4_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_nc_sa_3_by_year(args, count_data): + """ + Processing count data: CC BY-NC-SA 3.0 by year + """ + LOGGER.info(process_cc_by_nc_sa_3_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + if tool != "CC BY-NC-SA 3.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_3_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_nc_sa_4_by_year(args, count_data): + """ + Processing count data: CC BY-NC-SA 4.0 by year + """ + LOGGER.info(process_cc_by_nc_sa_4_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + if tool != "CC BY-NC-SA 4.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_4_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_sa_4_by_year(args, count_data): + """ + Processing count data: CC BY-SA 4.0 by year + """ + LOGGER.info(process_cc_by_sa_4_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + if tool != "CC BY-SA 4.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), 
columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_sa_4_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc0_1_by_year(args, count_data): + """ + Processing count data: CC0 1.0 by year + """ + LOGGER.info(process_cc0_1_by_year.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + year = str(row.YEAR) + count = int(row.COUNT) + if tool != "CC0 1.0": + continue + data[year] = count + + data = pd.DataFrame(data.items(), columns=["Year", "Count"]) + data.sort_values("Year", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc0_1_by_year.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_3_by_category(args, count_data): + """ + Processing count data: CC BY 3.0 by category + """ + LOGGER.info(process_cc_by_3_by_category.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC BY 3.0": + continue + data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_3_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_4_by_category(args, count_data): + """ + Processing count data: CC BY 4.0 by category + """ + LOGGER.info(process_cc_by_4_by_category.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC BY 4.0": + continue + 
data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_4_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_nc_nd_4_by_category(args, count_data): + """ + Processing count data: CC BY-NC-ND 4.0 by category + """ + LOGGER.info(process_cc_by_nc_nd_4_by_category.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC BY-NC-ND 4.0": + continue + data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_nd_4_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_nc_sa_3_by_category(args, count_data): + """ + Processing count data: CC BY-NC-SA 3.0 by category + """ + LOGGER.info(process_cc_by_nc_sa_3_by_category.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC BY-NC-SA 3.0": + continue + data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_3_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_nc_sa_4_by_category(args, count_data): + """ + Processing count data: CC BY-NC-SA 4.0 by category + """ + LOGGER.info(process_cc_by_nc_sa_4_by_category.__doc__.strip()) + data = 
{} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC BY-NC-SA 4.0": + continue + data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_4_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc_by_sa_4_by_category(args, count_data): + """ + Processing count data: CC BY-SA 4.0 by category + """ + LOGGER.info(process_cc_by_sa_4_by_category.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC BY-SA 4.0": + continue + data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_sa_4_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def process_cc0_1_by_category(args, count_data): + """ + Processing count data: CC0 1.0 by category + """ + LOGGER.info(process_cc0_1_by_category.__doc__.strip()) + data = {} + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + category = str(row.CATEGORY_NAME) + count = int(row.COUNT) + if tool != "CC0 1.0": + continue + data[category] = count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc0_1_by_category.csv" + ) + shared.data_to_csv(args, data, file_path) + + +def main(): + args = parse_arguments() + 
+ shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + shared.check_completion_file_exists(args, FILE_PATHS) + file_count = shared.path_join(PATHS["data_1-fetch"], "arxiv_1_count.csv") + file_category = shared.path_join( + PATHS["data_1-fetch"], "arxiv_2_count_by_category_report.csv" + ) + file_year = shared.path_join( + PATHS["data_1-fetch"], "arxiv_3_count_by_year.csv" + ) + file_author_bucket = shared.path_join( + PATHS["data_1-fetch"], "arxiv_4_count_by_author_bucket.csv" + ) + count_data = shared.open_data_file( + LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"] + ) + category_data = shared.open_data_file( + LOGGER, + file_category, + usecols=["TOOL_IDENTIFIER", "CATEGORY_NAME", "COUNT"], + ) + year_data = shared.open_data_file( + LOGGER, file_year, usecols=["TOOL_IDENTIFIER", "YEAR", "COUNT"] + ) + author_bucket_data = shared.open_data_file( + LOGGER, + file_author_bucket, + usecols=["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"], + ) + + process_totals_by_license(args, count_data) + process_totals_by_author_bucket(args, author_bucket_data) + process_cc_by_3_by_year(args, year_data) + process_cc_by_4_by_year(args, year_data) + process_cc_by_nc_nd_4_by_year(args, year_data) + process_cc_by_nc_sa_3_by_year(args, year_data) + process_cc_by_nc_sa_4_by_year(args, year_data) + process_cc_by_sa_4_by_year(args, year_data) + process_cc0_1_by_year(args, year_data) + process_cc_by_3_by_category(args, category_data) + process_cc_by_4_by_category(args, category_data) + process_cc_by_nc_nd_4_by_category(args, category_data) + process_cc_by_nc_sa_3_by_category(args, category_data) + process_cc_by_nc_sa_4_by_category(args, category_data) + process_cc_by_sa_4_by_category(args, category_data) + process_cc0_1_by_category(args, category_data) + + # Push changes + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new Arxiv data for {QUARTER}", + ) + shared.git_push_changes(args, 
PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") + sys.exit(1) diff --git a/scripts/3-report/arxiv_report.py b/scripts/3-report/arxiv_report.py new file mode 100644 index 00000000..98c954f9 --- /dev/null +++ b/scripts/3-report/arxiv_report.py @@ -0,0 +1,808 @@ +#!/usr/bin/env python +""" +This file is dedicated to visualizing and analyzing the data collected +from Arxiv. +""" + +# Standard library +import argparse +import os +import sys +import textwrap +import traceback +from pathlib import Path + +# Third-party +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import plot # noqa: E402 +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) +QUARTER = os.path.basename(PATHS["data_quarter"]) +SECTION_FILE = Path(__file__).name +SECTION_TITLE = "Arxiv" + + +def parse_arguments(): + """ + Parses command-line arguments, returns parsed arguments. 
+ """ + global QUARTER + LOGGER.info("Parsing command-line arguments") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--show-plots", + action="store_true", + help="Show generated plots (default: False)", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if report files exist", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + QUARTER = args.quarter + args.logger = LOGGER + args.paths = PATHS + return args + + +def arxiv_intro(args): + """ + Write Arxiv introduction. 
+ """ + LOGGER.info(arxiv_intro.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_1-fetch"], + "arxiv_1_count.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + # name_label = "TOOL_IDENTIFIER" + # data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + "Overview", + None, + None, + "Coming soon", + ) + + +def plot_totals_by_license_type(args): + """ + Create plots showing totals by license type + """ + LOGGER.info(plot_totals_by_license_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_totals_by_license.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "License" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + title = "Totals by license type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_totals_by_license_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing totals by license type.", + ) + + +def plot_cc_by_3_by_year(args): + """ + Create line plot showing CC BY 3.0 by year + """ + LOGGER.info(plot_cc_by_3_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_3_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC BY 3.0 by year" + plt = plot.line_plot( + args=args, + 
data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_3_by_year.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC BY 3.0 works by year.", + ) + + +def plot_cc_by_4_by_year(args): + """ + Create line plot showing CC BY 4.0 by year + """ + LOGGER.info(plot_cc_by_4_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_4_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC BY 4.0 by year" + plt = plot.line_plot( + args=args, + data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_4_by_year.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC BY 4.0 works by year.", + ) + + +def plot_cc_by_nc_nd_4_by_year(args): + """ + Create line plot showing CC BY-NC-ND 4.0 by year + """ + LOGGER.info(plot_cc_by_nc_nd_4_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_nc_nd_4_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC BY-NC-ND 4.0 by year" + plt = plot.line_plot( + args=args, + data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + 
PATHS["data_phase"], "arxiv_cc_by_nc_nd_4_by_year.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC BY-NC-ND 4.0 works by year.", + ) + + +def plot_cc_by_nc_sa_3_by_year(args): + """ + Create line plot showing CC BY-NC-SA 3.0 by year + """ + LOGGER.info(plot_cc_by_nc_sa_3_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_nc_sa_3_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC BY-NC-SA 3.0 by year" + plt = plot.line_plot( + args=args, + data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_3_by_year.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC BY-NC-SA 3.0 works by year.", + ) + + +def plot_cc_by_nc_sa_4_by_year(args): + """ + Create line plot showing CC BY-NC-SA 4.0 by year + """ + LOGGER.info(plot_cc_by_nc_sa_4_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_nc_sa_4_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC BY-NC-SA 4.0 by year" + plt = plot.line_plot( + args=args, + data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_4_by_year.png" + ) + 
LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC BY-NC-SA 4.0 works by year.", + ) + + +def plot_cc_by_sa_4_by_year(args): + """ + Create line plot showing CC BY-SA 4.0 by year + """ + LOGGER.info(plot_cc_by_sa_4_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_sa_4_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC BY-SA 4.0 by year" + plt = plot.line_plot( + args=args, + data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_sa_4_by_year.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC BY-SA 4.0 works by year.", + ) + + +def plot_cc0_1_by_year(args): + """ + Create line plot showing CC0 1.0 by year + """ + LOGGER.info(plot_cc0_1_by_year.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc0_1_by_year.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="Year") + title = "CC0 1.0 legal tool by year" + plt = plot.line_plot( + args=args, + data=data, + title=title, + xlabel="Year", + ylabel="Number of works", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc0_1_by_year.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], 
exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Line plot showing CC0 1.0 works by year.", + ) + + +def plot_totals_by_author_bucket(args): + """ + Create stacked vertical bar plot showing totals by author bucket + """ + LOGGER.info(plot_totals_by_author_bucket.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_totals_by_author_bucket.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + data = shared.open_data_file(LOGGER, file_path, index_col="AUTHOR_BUCKET") + stack_labels = list(data.columns) + title = "Totals by author bucket" + plt = plot.stacked_barv_plot( + args=args, + data=data, + title=title, + name_label="Author Bucket", + stack_labels=stack_labels, + xlabel="Author Bucket", + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_totals_by_author_bucket.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Stacked bar plot showing Arxiv works by author bucket," + " broken down by license type.", + ) + + +def plot_cc_by_3_by_category(args): + """ + Create plots showing CC BY 3.0 by category + """ + LOGGER.info(plot_cc_by_3_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_3_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + data = data.tail(10) + title = "CC BY 3.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + 
image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_3_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC BY 3.0 totals by category.", + ) + + +def plot_cc_by_4_by_category(args): + """ + Create plots showing CC BY 4.0 by category + """ + LOGGER.info(plot_cc_by_4_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_4_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + data = data.tail(10) + title = "CC BY 4.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_4_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC BY 4.0 totals by category.", + ) + + +def plot_cc_by_nc_nd_4_by_category(args): + """ + Create plots showing CC BY-NC-ND 4.0 by category + """ + LOGGER.info(plot_cc_by_nc_nd_4_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_nc_nd_4_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, 
ascending=True, inplace=True) + data = data.tail(10) + title = "CC BY-NC-ND 4.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_nd_4_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC BY-NC-ND 4.0 totals by category.", + ) + + +def plot_cc_by_nc_sa_3_by_category(args): + """ + Create plots showing CC BY-NC-SA 3.0 by category + """ + LOGGER.info(plot_cc_by_nc_sa_3_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_nc_sa_3_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + data = data.tail(10) + title = "CC BY-NC-SA 3.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_3_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC BY-NC-SA 3.0 totals by category.", + ) + + +def plot_cc_by_nc_sa_4_by_category(args): + """ + Create plots showing CC BY-NC-SA 4.0 by category + """ + LOGGER.info(plot_cc_by_nc_sa_4_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + 
"arxiv_cc_by_nc_sa_4_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + data = data.tail(10) + title = "CC BY-NC-SA 4.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_nc_sa_4_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC BY-NC-SA 4.0 totals by category.", + ) + + +def plot_cc_by_sa_4_by_category(args): + """ + Create plots showing CC BY-SA 4.0 by category + """ + LOGGER.info(plot_cc_by_sa_4_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc_by_sa_4_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + data = data.tail(10) + title = "CC BY-SA 4.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc_by_sa_4_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC BY-SA 4.0 totals by 
category.", + ) + + +def plot_cc0_1_by_category(args): + """ + Create plots showing CC0 1.0 by category + """ + LOGGER.info(plot_cc0_1_by_category.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "arxiv_cc0_1_by_category.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + data = data.tail(10) + title = "CC0 1.0 by category" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "arxiv_cc0_1_by_category.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION_FILE, + SECTION_TITLE, + title, + image_path, + "Plots showing CC0 1.0 totals by category.", + ) + + +def main(): + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + last_entry = shared.path_join( + PATHS["data_phase"], "arxiv_cc0_1_by_category.png" + ) + shared.check_completion_file_exists(args, last_entry) + arxiv_intro(args) + plot_totals_by_license_type(args) + plot_cc_by_3_by_year(args) + plot_cc_by_4_by_year(args) + plot_cc_by_nc_nd_4_by_year(args) + plot_cc_by_nc_sa_3_by_year(args) + plot_cc_by_nc_sa_4_by_year(args) + plot_cc_by_sa_4_by_year(args) + plot_cc0_1_by_year(args) + plot_totals_by_author_bucket(args) + plot_cc_by_3_by_category(args) + plot_cc_by_4_by_category(args) + plot_cc_by_nc_nd_4_by_category(args) + plot_cc_by_nc_sa_3_by_category(args) + plot_cc_by_nc_sa_4_by_category(args) + plot_cc_by_sa_4_by_category(args) + plot_cc0_1_by_category(args) + + # Add and commit changes + args = shared.git_add_and_commit( + args,
+ PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit arXiv reports for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) diff --git a/scripts/plot.py b/scripts/plot.py index fcbdb817..064a328f 100644 --- a/scripts/plot.py +++ b/scripts/plot.py @@ -73,7 +73,7 @@ def combined_plot( height = 2.5 fig, (ax1, ax2) = plt.subplots( - 1, 2, figsize=(8, height), width_ratios=(2, 1), layout="constrained" + 1, 2, figsize=(12, height), width_ratios=(2, 1), layout="constrained" ) colors = colormaps["tab10"].colors @@ -128,6 +128,116 @@ def combined_plot( return plt +def line_plot(args, data, title, xlabel=None, ylabel=None): + plt.rcParams.update({"font.family": "monospace", "figure.dpi": 300}) + + fig, ax = plt.subplots(figsize=(12, 5), layout="constrained") + colors = colormaps["tab10"].colors + + for i, col in enumerate(data.columns): + ax.plot( + data.index, + data[col], + color=colors[i % len(colors)], + label=col, + ) + + ax.set_title(title) + if xlabel: + ax.set_xlabel(xlabel) + if ylabel: + ax.set_ylabel(ylabel) + ax.yaxis.set_major_formatter(ticker.FuncFormatter(number_formatter)) + ax.tick_params(axis="x", which="major", labelrotation=45) + ax.legend(fontsize="small") + ax.grid(True, alpha=0.3) + + plt.annotate( + f"Creative Commons (CC)\ndata from {args.quarter}", + (0.95, 5), + xycoords=("figure
fraction", "figure points"), + color="gray", + fontsize="x-small", + horizontalalignment="right", + ) + + if args.show_plots: + plt.show() + + return plt + + +def stacked_barv_plot( + args, + data, + title, + name_label, + stack_labels, + yscale="linear", + xlabel=None, +): + """ + Create a stacked vertical bar plot. + """ + if len(data) > 10: + raise shared.QuantifyingException( + "stacked_barv_plot() is limited to a maximum of 10 data points" + ) + + plt.rcParams.update({"font.family": "monospace", "figure.dpi": 300}) + + fig, ax = plt.subplots(figsize=(12, 5), layout="constrained") + colors = colormaps["tab10"].colors + bottom = [0] * len(data) + + for i, label in enumerate(stack_labels): + ax.bar( + data.index, + data[label], + bottom=bottom, + color=colors[i % len(colors)], + label=label, + log=(yscale == "log"), + ) + bottom = [ + current_bottom + height + for current_bottom, height in zip(bottom, data[label]) + ] + + ax.set_ylabel("Number of works") + ax.yaxis.set_major_formatter(ticker.FuncFormatter(number_formatter)) + ax.tick_params(axis="x", which="major", labelrotation=45) + + if xlabel: + ax.set_xlabel(xlabel) + else: + ax.set_xlabel(name_label) + + ax.legend( + title="Type", + fontsize="x-small", + title_fontsize="x-small", + loc="upper left", + bbox_to_anchor=(1.02, 1), + ) + + plt.suptitle(title) + plt.annotate( + f"Creative Commons (CC)\nbar y scale: {yscale}, data from" + f" {args.quarter}", + (0.95, 5), + xycoords=("figure fraction", "figure points"), + color="gray", + fontsize="x-small", + horizontalalignment="right", + ) + + if args.show_plots: + plt.show() + + return plt + + def number_formatter(x, pos): """ Use the millions formatter for x-axis From ec1bff8980b306d64a5a9d74e4dbd7912670c109 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Thu, 19 Feb 2026 12:11:22 +0100 Subject: [PATCH 2/2] changed mode of scripts --- scripts/2-process/arxiv_process.py | 0 scripts/3-report/arxiv_report.py | 0 2 files changed, 0 insertions(+), 0 
deletions(-) mode change 100644 => 100755 scripts/2-process/arxiv_process.py mode change 100644 => 100755 scripts/3-report/arxiv_report.py diff --git a/scripts/2-process/arxiv_process.py b/scripts/2-process/arxiv_process.py old mode 100644 new mode 100755 diff --git a/scripts/3-report/arxiv_report.py b/scripts/3-report/arxiv_report.py old mode 100644 new mode 100755