#!/usr/bin/env python
"""
This file is dedicated to processing Openverse data
for analysis and comparison between quarters.
"""
# diff --git a/scripts/2-process/openverse_process.py
#            b/scripts/2-process/openverse_process.py (new file mode 100644)

# Standard library
import argparse
import csv
import os
import sys
import traceback
from collections import defaultdict

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
# Legal tools counted by the "permissive" reports
PERMISSIVE_TOOLS = frozenset({"CC0", "CC BY", "CC BY-SA"})
# Restriction category per legal tool; any tool that is not listed here is
# reported as "Restricted"
# https://creativecommons.org/public-domain/freeworks/
RESTRICTION_BY_TOOL = {
    "CC0": "Public domain",
    "PDM": "Public domain",
    "CC BY": "Permissive",
    "CC BY-SA": "Copyleft",
}


def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.

    The returned namespace also carries the module LOGGER and PATHS as
    args.logger and args.paths. When --quarter differs from the default
    quarter, the global PATHS is replaced by shared.paths_update() first.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    if args.quarter != QUARTER:
        global PATHS
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
    args.logger = LOGGER
    args.paths = PATHS
    return args


def check_for_data_file(file_path):
    """
    Stop processing if file_path already exists.

    Raises shared.QuantifyingException with exit code 0, which the
    entry point below logs at info level.
    """
    if os.path.exists(file_path):
        raise shared.QuantifyingException(
            f"Processed data already exists for {QUARTER}", 0
        )


def data_to_csv(args, data, file_path):
    """
    Write the data frame to file_path as CSV (no-op without --enable-save).
    """
    if not args.enable_save:
        return
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    # emulate csv.unix_dialect
    data.to_csv(
        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
    )


def _sum_media_counts(count_data, column, tools=None):
    """
    Sum MEDIA_COUNT for each distinct value found in column.

    When tools is given, only rows whose TOOL_IDENTIFIER is in tools are
    counted. Returns a mapping of str(column value) to int total.
    """
    totals = defaultdict(int)
    for row in count_data.itertuples(index=False):
        if tools is not None and str(row.TOOL_IDENTIFIER) not in tools:
            continue
        totals[str(getattr(row, column))] += int(row.MEDIA_COUNT)
    return totals


def _save_totals(args, totals, label, file_name):
    """
    Save totals as a two-column CSV (label, "Count") sorted by label.
    """
    data = pd.DataFrame(list(totals.items()), columns=[label, "Count"])
    data.sort_values(label, ascending=True, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(PATHS["data_phase"], file_name)
    check_for_data_file(file_path)
    data_to_csv(args, data, file_path)


# NOTE: the docstrings of the process_* functions below are logged verbatim
# as progress messages, so they are kept short on purpose.


def process_totals_by_license(args, count_data):
    """
    Processing count data: totals by license
    """
    LOGGER.info(process_totals_by_license.__doc__.strip())
    totals = _sum_media_counts(count_data, "TOOL_IDENTIFIER")
    _save_totals(args, totals, "License", "openverse_totals_by_license.csv")


def process_totals_by_media_type(args, count_data):
    """
    Processing count data: totals by media type
    """
    LOGGER.info(process_totals_by_media_type.__doc__.strip())
    totals = _sum_media_counts(count_data, "MEDIA_TYPE")
    _save_totals(
        args, totals, "Media_type", "openverse_totals_by_media_type.csv"
    )


def process_totals_by_source(args, count_data):
    """
    Processing count data: totals by source
    """
    LOGGER.info(process_totals_by_source.__doc__.strip())
    totals = _sum_media_counts(count_data, "SOURCE")
    _save_totals(args, totals, "Source", "openverse_totals_by_source.csv")


def process_permissive_by_media_type(args, count_data):
    """
    Processing count data: permissive by media type
    """
    LOGGER.info(process_permissive_by_media_type.__doc__.strip())
    totals = _sum_media_counts(
        count_data, "MEDIA_TYPE", tools=PERMISSIVE_TOOLS
    )
    _save_totals(
        args, totals, "Media_type", "openverse_permissive_by_media_type.csv"
    )


def process_permissive_by_source(args, count_data):
    """
    Processing count data: permissive content by source
    """
    LOGGER.info(process_permissive_by_source.__doc__.strip())
    totals = _sum_media_counts(count_data, "SOURCE", tools=PERMISSIVE_TOOLS)
    _save_totals(
        args, totals, "Source", "openverse_permissive_by_source.csv"
    )


def process_totals_by_restriction(args, count_data):
    """
    Processing count data: totals by restriction
    """
    LOGGER.info(process_totals_by_restriction.__doc__.strip())
    # Every category is pre-seeded so it appears in the output even when
    # its total is zero
    totals = {
        "Copyleft": 0,
        "Permissive": 0,
        "Public domain": 0,
        "Restricted": 0,
    }
    for row in count_data.itertuples(index=False):
        category = RESTRICTION_BY_TOOL.get(
            str(row.TOOL_IDENTIFIER), "Restricted"
        )
        totals[category] += int(row.MEDIA_COUNT)
    _save_totals(
        args, totals, "Category", "openverse_totals_by_restriction.csv"
    )


def main():
    """
    Read the fetched Openverse counts, write every processed CSV, and
    (when enabled) commit and push the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv")
    count_data = shared.open_data_file(
        LOGGER,
        file_count,
        usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"],
    )
    process_totals_by_license(args, count_data)
    process_totals_by_media_type(args, count_data)
    process_totals_by_source(args, count_data)
    process_permissive_by_media_type(args, count_data)
    process_permissive_by_source(args, count_data)
    process_totals_by_restriction(args, count_data)
    # Push changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # args.quarter equals QUARTER unless --quarter was given
        f"Add and commit new Openverse data for {args.quarter}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        # Exit with the same attribute that was tested above
        sys.exit(e.exit_code)
    except SystemExit as e:
        # A zero exit code (e.g. argparse --help) is not an error
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # The traceback is formatted explicitly; LOGGER.exception() would
        # append it a second time
        LOGGER.critical(
            f"(1) Unhandled exception:\n{traceback.format_exc()}"
        )
        sys.exit(1)

# diff --git a/scripts/3-report/openverse_report.py
#            b/scripts/3-report/openverse_report.py (new file mode 100644)
#!/usr/bin/env python
"""
This file is dedicated to visualizing and analyzing the data collected
from Openverse.
"""
# Standard library
import argparse
import os
import sys
import textwrap
import traceback

# Third-party
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import plot  # noqa: E402
import shared  # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "Openverse data"


def parse_arguments():
    """
    Parses command-line arguments, returns parsed arguments.

    The returned namespace also carries the module LOGGER and PATHS as
    args.logger and args.paths. When --quarter differs from the default
    quarter, the global PATHS is replaced by shared.paths_update() first.
    """
    LOGGER.info("Parsing command-line arguments")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
    )
    parser.add_argument(
        "--show-plots",
        action="store_true",
        help="Show generated plots (default: False)",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results (default: False)",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions such as fetch, merge, add, commit, and push"
        " (default: False)",
    )
    args = parser.parse_args()
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    if args.quarter != QUARTER:
        global PATHS
        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
    args.logger = LOGGER
    args.paths = PATHS
    return args


def _percentage(part, whole):
    """
    Return part/whole as a percentage string with two decimals.

    Returns "0.00" when whole is zero instead of dividing by zero.
    """
    if not whole:
        return "0.00"
    return f"{(part / whole) * 100:.2f}"


def _load_processed_data(file_name, name_label):
    """
    Open a 2-process CSV, using the name_label column as the index.
    """
    file_path = shared.path_join(PATHS["data_2-process"], file_name)
    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    return shared.open_data_file(LOGGER, file_path, index_col=name_label)


def _publish_plot(args, plt, title, image_name, caption):
    """
    Save the plot (only with --enable-save) and add it to the README.
    """
    image_path = shared.path_join(PATHS["data_phase"], image_name)
    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        plt.savefig(image_path)
    shared.update_readme(args, SECTION, title, image_path, caption)


# NOTE: the docstrings of openverse_intro and the plot_* functions below are
# logged verbatim as progress messages, so they are kept short on purpose.


def openverse_intro(args):
    """
    Write Openverse Introduction.
    """
    LOGGER.info(openverse_intro.__doc__.strip())
    file_path = shared.path_join(
        PATHS["data_1-fetch"],
        "openverse_fetch.csv",
    )
    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    name_label = "TOOL_IDENTIFIER"
    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
    total = data["MEDIA_COUNT"].sum()
    media_counts = data.groupby("MEDIA_TYPE")["MEDIA_COUNT"].sum()
    # The per-media-type sums add up to the overall total computed above
    audio_percentage = _percentage(media_counts.get("audio", 0), total)
    images_percentage = _percentage(media_counts.get("images", 0), total)
    unique_sources = data["SOURCE"].nunique()
    shared.update_readme(
        args,
        SECTION,
        "Overview",
        None,
        None,
        "The Openverse data, below, uses the `Media_count` field"
        " returned by API for search queries of the various legal tools."
        "\n"
        f" The results indicate that there are {total} count of audio"
        " and images that are licensed or put in the"
        " public domain using a Creative Commons (CC) legal tool."
        " They respectively take a percentage of"
        f" {audio_percentage} and {images_percentage},"
        " of the total media count returned by the Openverse API."
        "\n"
        f"There are {unique_sources} count of"
        " data sources under the openverse API.\n"
        "\n"
        "Thank you Openverse for providing a public API"
        " access to its media metadata!",
    )


def plot_totals_by_license_type(args):
    """
    Create plots showing totals by license type
    """
    LOGGER.info(plot_totals_by_license_type.__doc__.strip())
    name_label = "License"
    data_label = "Count"
    data = _load_processed_data("openverse_totals_by_license.csv", name_label)
    data.sort_values(data_label, ascending=True, inplace=True)
    title = "Totals by license type"
    plt = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )
    _publish_plot(
        args,
        plt,
        title,
        "openverse_totals_by_license_type.png",
        "Plots showing Creative Commons (CC) legal tool totals and"
        " percentages.",
    )


def plot_totals_by_media_type(args):
    """
    Create plots showing totals by media type
    """
    LOGGER.info(plot_totals_by_media_type.__doc__.strip())
    name_label = "Media_type"
    data_label = "Count"
    data = _load_processed_data(
        "openverse_totals_by_media_type.csv", name_label
    )
    data.sort_values(name_label, ascending=False, inplace=True)
    title = "Totals by media_type"
    plt = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )
    _publish_plot(
        args,
        plt,
        title,
        "openverse_totals_by_media_type.png",
        "Plots showing Creative Commons (CC) legal tool"
        " totals by each media type",
    )


def plot_totals_by_sources(args):
    """
    Create plots showing totals by sources
    """
    LOGGER.info(plot_totals_by_sources.__doc__.strip())
    name_label = "Source"
    data_label = "Count"
    # File name must match the one written by 2-process/openverse_process.py
    data = _load_processed_data("openverse_totals_by_source.csv", name_label)
    # Top 10 by count (not by source name), kept in ascending count order
    # like the license plot
    top_10 = data.sort_values(data_label, ascending=True).tail(10)
    title = "Totals by sources"
    plt = plot.combined_plot(
        args=args,
        data=top_10,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )
    _publish_plot(
        args,
        plt,
        title,
        "openverse_sources.png",
        "Plots showing Creative Commons (CC) legal tool totals"
        " across the top 10 sources returned by openverse API.",
    )


def plot_permissive_by_media_type(args):
    """
    Create plots showing the count of permissive content by media type
    """
    LOGGER.info(plot_permissive_by_media_type.__doc__.strip())
    name_label = "Media_type"
    data_label = "Count"
    data = _load_processed_data(
        "openverse_permissive_by_media_type.csv", name_label
    )
    data.sort_values(name_label, ascending=False, inplace=True)
    title = "Permissive content by media type"
    plt = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )
    _publish_plot(
        args,
        plt,
        title,
        "openverse_permissive_by_media_type.png",
        "Plots showing count of permissive content by media type.",
    )


def plot_permissive_by_source(args):
    """
    Create plots showing count of permissive content by source
    """
    LOGGER.info(plot_permissive_by_source.__doc__.strip())
    name_label = "Source"
    data_label = "Count"
    data = _load_processed_data(
        "openverse_permissive_by_source.csv", name_label
    )
    # Top 10 by count (not by source name), kept in ascending count order
    # like the license plot
    top_10 = data.sort_values(data_label, ascending=True).tail(10)
    title = "Permissive by source"
    plt = plot.combined_plot(
        args=args,
        data=top_10,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )
    _publish_plot(
        args,
        plt,
        title,
        "openverse_permissive_by_source.png",
        "Plots showing count of permissive content"
        " by top 10 sources in openverse.",
    )


def plot_totals_by_restriction(args):
    """
    Create plots showing totals by restriction
    """
    LOGGER.info(plot_totals_by_restriction.__doc__.strip())
    name_label = "Category"
    data_label = "Count"
    data = _load_processed_data(
        "openverse_totals_by_restriction.csv", name_label
    )
    data.sort_values(name_label, ascending=False, inplace=True)
    title = "Totals by restriction"
    plt = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )
    _publish_plot(
        args,
        plt,
        title,
        "openverse_restriction.png",
        "Plots showing totals by different levels of rights reserved"
        " on openverse media contents."
        " This shows the distribution of Public domain,"
        " Permissive, Copyleft and restricted"
        " licenses used in Openverse media contents.",
    )


def main():
    """
    Write the Openverse README section with all plots and (when enabled)
    commit and push the results.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    openverse_intro(args)
    plot_totals_by_license_type(args)
    plot_totals_by_media_type(args)
    plot_totals_by_sources(args)
    plot_permissive_by_media_type(args)
    plot_permissive_by_source(args)
    plot_totals_by_restriction(args)

    # Add and commit changes
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # args.quarter equals QUARTER unless --quarter was given
        f"Add and commit Openverse reports for {args.quarter}",
    )
    shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)