13 changes: 13 additions & 0 deletions scripts/1-fetch/wikipedia_fetch.py
@@ -63,6 +63,18 @@ def parse_arguments():
return args


def check_for_completion():
try:
with open(FILE_LANGUAGES, "r", newline="") as file_obj:
reader = csv.DictReader(file_obj, dialect="unix")
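            # NOTE: threshold assumed from context; Wikipedia has over
            # 300 language editions, so more rows than that indicates a
            # completed fetch.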
if len(list(reader)) > 300:
raise shared.QuantifyingException(
f"Data fetch completed for {QUARTER}", 0
)
except FileNotFoundError:
pass # File may not be found without --enable-save, etc.


def write_data(args, tool_data):
if not args.enable_save:
return args
@@ -157,6 +169,7 @@ def query_wikipedia_languages(session):
def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
check_for_completion()
shared.git_fetch_and_merge(args, PATHS["repo"])
session = shared.get_session()
tool_data = query_wikipedia_languages(session)
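The completion check added above exits early by raising `shared.QuantifyingException` with exit code 0, which the top-level handler treats as an informational stop rather than a failure. A minimal sketch of that pattern, assuming the exception stores a `message` and an `exit_code` as the handlers in these scripts suggest:

```python
# Minimal sketch, not the shared module's actual definition.
class QuantifyingException(Exception):
    def __init__(self, message, exit_code=1):
        self.message = message
        self.exit_code = exit_code
        super().__init__(message)


try:
    raise QuantifyingException("Data fetch completed for 2024Q4", 0)
except QuantifyingException as e:
    if e.exit_code == 0:
        print(f"info: {e.message}")  # normal "nothing to do" stop
    else:
        print(f"error: {e.message}")
```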
53 changes: 9 additions & 44 deletions scripts/2-process/github_process.py
@@ -11,7 +11,6 @@
import traceback

# Third-party
# import pandas as pd
import pandas as pd

# Add parent directory so shared can be imported
@@ -60,6 +59,13 @@ def parse_arguments():
return args


def check_for_data_file(file_path):
if os.path.exists(file_path):
raise shared.QuantifyingException(
f"Processed data already exists for {QUARTER}", 0
)
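`check_for_data_file` makes reruns idempotent: if the processed CSV already exists, the script stops with exit code 0 instead of overwriting it. A hypothetical race-free variant of the same guard, not part of this change, would use exclusive-create mode so the existence check and file creation happen in a single atomic step:

```python
# Hypothetical alternative sketch; shared and QUARTER as in this script.
def create_data_file(file_path):
    try:
        return open(file_path, "x", newline="")
    except FileExistsError:
        raise shared.QuantifyingException(
            f"Processed data already exists for {QUARTER}", 0
        )
```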


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
@@ -92,6 +98,7 @@ def process_totals_by_license(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_license.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


@@ -126,52 +133,10 @@ def process_totals_by_restriction(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_restriction.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


# def load_quarter_data(quarter):
# """
# Load data for a specific quarter.
# """
# file_path = os.path.join(PATHS["data"], f"{quarter}",
# "1-fetch", "github_fetched")
# if not os.path.exists(file_path):
# LOGGER.error(f"Data file for quarter {quarter} not found.")
# return None
# return pd.read_csv(file_path)


# def compare_data(current_quarter, previous_quarter):
# """
# Compare data between two quarters.
# """
# current_data = load_quarter_data(current_quarter)
# previous_data = load_quarter_data(previous_quarter)

# if current_data is None or previous_data is None:
# return

# Process data to compare totals


# def parse_arguments():
# """
# Parses command-line arguments, returns parsed arguments.
# """
# LOGGER.info("Parsing command-line arguments")
# parser = argparse.ArgumentParser(
# description="Google Custom Search Comparison Report")
# parser.add_argument(
# "--current_quarter", type=str, required=True,
# help="Current quarter for comparison (e.g., 2024Q3)"
# )
# parser.add_argument(
# "--previous_quarter", type=str, required=True,
# help="Previous quarter for comparison (e.g., 2024Q2)"
# )
# return parser.parse_args()


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
276 changes: 276 additions & 0 deletions scripts/2-process/openverse_process.py
@@ -0,0 +1,276 @@
#!/usr/bin/env python
"""
This file is dedicated to processing Openverse data
for analysis and comparison between quarters.
"""
# Standard library
import argparse
import csv
import os
import sys
import traceback
from collections import defaultdict

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])


def parse_arguments():
"""
Parse command-line options, returns parsed argument namespace.
"""
LOGGER.info("Parsing command-line options")
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--quarter",
default=QUARTER,
help=f"Data quarter in format YYYYQx (default: {QUARTER})",
)
parser.add_argument(
"--enable-save",
action="store_true",
help="Enable saving results (default: False)",
)
parser.add_argument(
"--enable-git",
action="store_true",
help="Enable git actions such as fetch, merge, add, commit, and push"
" (default: False)",
)
args = parser.parse_args()
if not args.enable_save and args.enable_git:
parser.error("--enable-git requires --enable-save")
if args.quarter != QUARTER:
global PATHS
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
args.logger = LOGGER
args.paths = PATHS
return args


def check_for_data_file(file_path):
if os.path.exists(file_path):
raise shared.QuantifyingException(
f"Processed data already exists for {QUARTER}", 0
)


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
os.makedirs(PATHS["data_phase"], exist_ok=True)
# emulate csv.unix_dialect
data.to_csv(
file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
)
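The `# emulate csv.unix_dialect` comment refers to `csv.unix_dialect`, which quotes all fields and terminates rows with `"\n"`; the `quoting=csv.QUOTE_ALL, lineterminator="\n"` options reproduce that behavior from pandas. An illustrative check of the equivalence:

```python
import csv
import io

import pandas as pd

df = pd.DataFrame({"License": ["CC BY"], "Count": [1]})
via_pandas = io.StringIO()
df.to_csv(via_pandas, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n")

via_csv = io.StringIO()
writer = csv.writer(via_csv, dialect="unix")
writer.writerow(["License", "Count"])
writer.writerow(["CC BY", 1])

assert via_pandas.getvalue() == via_csv.getvalue()
```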


def process_totals_by_license(args, count_data):
"""
Processing count data: totals by license
"""
LOGGER.info(process_totals_by_license.__doc__.strip())
data = defaultdict(int)

for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
count = int(row.MEDIA_COUNT)

data[tool] += count
data = pd.DataFrame(data.items(), columns=["License", "Count"])
data.sort_values("License", ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_license.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)
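Each of the `process_*` functions in this file follows the same shape: accumulate counts into a `defaultdict(int)` while iterating with `itertuples(index=False)`, then convert to a sorted two-column `DataFrame`. A toy illustration of the pattern:

```python
from collections import defaultdict

import pandas as pd

rows = pd.DataFrame(
    {"TOOL_IDENTIFIER": ["CC BY", "CC0", "CC BY"], "MEDIA_COUNT": [2, 3, 5]}
)
totals = defaultdict(int)
for row in rows.itertuples(index=False):
    totals[str(row.TOOL_IDENTIFIER)] += int(row.MEDIA_COUNT)
# totals == {"CC BY": 7, "CC0": 3}
data = pd.DataFrame(totals.items(), columns=["License", "Count"])
```

The same result could come from `rows.groupby("TOOL_IDENTIFIER", as_index=False)["MEDIA_COUNT"].sum()`; the explicit loop keeps the per-row type coercion visible.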


def process_totals_by_media_type(args, count_data):
"""
Processing count data: totals by media type
"""
# https://creativecommons.org/public-domain/freeworks/
LOGGER.info(process_totals_by_media_type.__doc__.strip())
data = defaultdict(int)

for row in count_data.itertuples(index=False):
media_type = str(row.MEDIA_TYPE)
count = int(row.MEDIA_COUNT)

data[media_type] += count
data = pd.DataFrame(data.items(), columns=["Media_type", "Count"])
data.sort_values("Media_type", ascending=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_media_type.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_totals_by_source(args, count_data):
"""
Processing count data: totals by source
"""
LOGGER.info(process_totals_by_source.__doc__.strip())
data = defaultdict(int)
for row in count_data.itertuples(index=False):
source = str(row.SOURCE)
count = int(row.MEDIA_COUNT)

data[source] += count
data = pd.DataFrame(data.items(), columns=["Source", "Count"])
data.sort_values("Source", ascending=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_source.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_permissive_by_media_type(args, count_data):
"""
Processing count data: permissive by media type
"""
LOGGER.info(process_permissive_by_media_type.__doc__.strip())

data = defaultdict(int)

for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
media_type = str(row.MEDIA_TYPE)
count = int(row.MEDIA_COUNT)

if tool in ["CC0", "CC BY", "CC BY-SA"]:
data[media_type] += count

data = pd.DataFrame(data.items(), columns=["Media_type", "Count"])
data.sort_values("Media_type", ascending=True, inplace=True)

file_path = shared.path_join(
PATHS["data_phase"], "openverse_permissive_by_media_type.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_permissive_by_source(args, count_data):
"""
Processing count data: permissive content by source
"""
LOGGER.info(process_permissive_by_source.__doc__.strip())
data = defaultdict(int)
for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
source = str(row.SOURCE)
count = int(row.MEDIA_COUNT)
if tool in ["CC0", "CC BY", "CC BY-SA"]:
data[source] += count
data = pd.DataFrame(data.items(), columns=["Source", "Count"])
data.sort_values("Source", ascending=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_permissive_by_source.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)
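Since the grouping functions above differ only in which column they key on and whether they filter to the permissive tools (`CC0`, `CC BY`, `CC BY-SA`), they could in principle share one helper. A hypothetical refactor sketch, not part of this change:

```python
PERMISSIVE_TOOLS = frozenset(["CC0", "CC BY", "CC BY-SA"])


def totals_by_column(count_data, column, label, permissive_only=False):
    # Group MEDIA_COUNT by the given column, optionally keeping only
    # permissive tools, and return a sorted two-column DataFrame.
    data = defaultdict(int)
    for row in count_data.itertuples(index=False):
        if permissive_only and str(row.TOOL_IDENTIFIER) not in PERMISSIVE_TOOLS:
            continue
        data[str(getattr(row, column))] += int(row.MEDIA_COUNT)
    frame = pd.DataFrame(data.items(), columns=[label, "Count"])
    return frame.sort_values(label, ascending=True).reset_index(drop=True)
```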


def process_totals_by_restriction(args, count_data):
"""
Processing count data: totals by restriction
"""
LOGGER.info(process_totals_by_restriction.__doc__.strip())

data = {
"Copyleft": 0,
"Permissive": 0,
"Public domain": 0,
"Restricted": 0,
}

for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
count = int(row.MEDIA_COUNT)

if tool in ["CC0", "PDM"]:
key = "Public domain"

elif tool in ["CC BY"]:
key = "Permissive"

elif tool in ["CC BY-SA"]:
key = "Copyleft"

else:
key = "Restricted"

data[key] += count

data = pd.DataFrame(data.items(), columns=["Category", "Count"])
data.sort_values("Category", ascending=True, inplace=True)

file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_restriction.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)
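The if/elif chain above maps each tool to one of four restriction categories, defaulting to "Restricted". An equivalent lookup-table form (illustrative only):

```python
CATEGORY_BY_TOOL = {
    "CC0": "Public domain",
    "PDM": "Public domain",
    "CC BY": "Permissive",
    "CC BY-SA": "Copyleft",
}
key = CATEGORY_BY_TOOL.get(tool, "Restricted")
```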


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])

file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv")
count_data = shared.open_data_file(
LOGGER,
file_count,
usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"],
)
process_totals_by_license(args, count_data)
process_totals_by_media_type(args, count_data)
process_totals_by_source(args, count_data)
process_permissive_by_media_type(args, count_data)
process_permissive_by_source(args, count_data)
process_totals_by_restriction(args, count_data)
# Push changes
args = shared.git_add_and_commit(
args,
PATHS["repo"],
PATHS["data_quarter"],
f"Add and commit new GitHub data for {QUARTER}",
)
shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
try:
main()
except shared.QuantifyingException as e:
if e.exit_code == 0:
LOGGER.info(e.message)
else:
LOGGER.error(e.message)
        sys.exit(e.exit_code)
except SystemExit as e:
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)