51 changes: 8 additions & 43 deletions scripts/1-fetch/arxiv_fetch.py
@@ -125,31 +125,6 @@ def parse_arguments():
return args


def initialize_data_file(file_path, headers):
"""Initialize CSV file with headers if it doesn't exist."""
if not os.path.isfile(file_path):
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=headers, dialect="unix"
)
writer.writeheader()


def initialize_all_data_files(args):
"""Initialize all data files used by this script.

Creates the data directory and initializes empty CSVs with headers.
"""
if not args.enable_save:
return

os.makedirs(PATHS["data_1-fetch"], exist_ok=True)
initialize_data_file(FILE_ARXIV_COUNT, HEADER_COUNT)
initialize_data_file(FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT)
initialize_data_file(FILE_ARXIV_YEAR, HEADER_YEAR)
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)


def get_identifier_mapping():
global IDENTIER_MAPPING
LOGGER.info("Loading CC Legal Tool metadata for CC identifier mapping")
@@ -472,19 +447,6 @@ def query_arxiv(args, session):
return data, cc_articles_found


def rows_to_csv(args, fieldnames, rows, file_path):
if not args.enable_save:
return args

with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
writer = csv.DictWriter(
file_handle, fieldnames=fieldnames, dialect="unix"
)
writer.writeheader()
for row in rows:
writer.writerow(row)


def write_data(args, data):
"""
Write fetched data to CSV files.
Expand All @@ -508,7 +470,9 @@ def write_data(args, data):
}
)
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
rows_to_csv(args, HEADER_AUTHOR_BUCKET, rows, FILE_ARXIV_AUTHOR_BUCKET)
shared.rows_to_csv(
args, FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET, rows
)

# Save category report
# fetched_data["category_counts"]: {identifier: {category_code: count}}
Expand All @@ -527,15 +491,17 @@ def write_data(args, data):
}
)
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
rows_to_csv(args, HEADER_CATEGORY_REPORT, rows, FILE_ARXIV_CATEGORY_REPORT)
shared.rows_to_csv(
args, FILE_ARXIV_CATEGORY_REPORT, HEADER_CATEGORY_REPORT, rows
)

# Save tool counts report
# fetched_data["tool_counts"]: {identifier: count}
rows = []
for identifier, count in data["tool_counts"].items():
rows.append({"TOOL_IDENTIFIER": identifier, "COUNT": count})
rows.sort(key=itemgetter("TOOL_IDENTIFIER"))
rows_to_csv(args, HEADER_COUNT, rows, FILE_ARXIV_COUNT)
shared.rows_to_csv(args, FILE_ARXIV_COUNT, HEADER_COUNT, rows)

# Save year count report
# fetched_data["year_counts"]: {identifier: {year: count}}
Expand All @@ -546,7 +512,7 @@ def write_data(args, data):
{"TOOL_IDENTIFIER": identifier, "YEAR": year, "COUNT": count}
)
rows.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
rows_to_csv(args, HEADER_YEAR, rows, FILE_ARXIV_YEAR)
shared.rows_to_csv(args, FILE_ARXIV_YEAR, HEADER_YEAR, rows)


def write_provence(args, cc_articles_found):
@@ -584,7 +550,6 @@ def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])
initialize_all_data_files(args)
get_identifier_mapping()
session = shared.get_session()
query_category_mapping(args, session)
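Note: the shared.rows_to_csv helper that these scripts now call is defined in scripts/shared.py, which is not included in the hunks shown here. The following is a minimal sketch of what that helper presumably looks like, inferred from the call sites in this PR; the argument order (args, file_path, fieldnames, rows, append=False), the append keyword, and the directory handling are assumptions, not the actual implementation.

import csv
import os


def rows_to_csv(args, file_path, fieldnames, rows, append=False):
    """Write rows (a list of dicts) to a unix-dialect CSV file.

    Sketch only: honors --enable-save, so callers no longer need their own
    enable_save guards, initialize_data_file helpers, or os.makedirs calls.
    """
    if not args.enable_save:
        return
    # Create the data directory for this phase if it is missing.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Write the header unless appending to a file that already has content.
    write_header = (
        not append
        or not os.path.isfile(file_path)
        or os.path.getsize(file_path) == 0
    )
    mode = "a" if append else "w"
    with open(file_path, mode, encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=fieldnames, dialect="unix"
        )
        if write_header:
            writer.writeheader()
        for row in rows:
            writer.writerow(row)

Under these assumptions, a plain call overwrites the file with a fresh header plus all rows, while append=True adds rows to an existing file, which matches how gcs_fetch.py below initializes and then incrementally extends its CSVs.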
33 changes: 8 additions & 25 deletions scripts/1-fetch/gcs_fetch.py
@@ -99,25 +99,14 @@ def get_search_service():
)


def initialize_data_file(file_path, header):
if not os.path.isfile(file_path):
with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=header, dialect="unix"
)
writer.writeheader()


def initialize_all_data_files(args):
if not args.enable_save:
return

# Create data directory for this phase
os.makedirs(PATHS["data_phase"], exist_ok=True)

initialize_data_file(FILE1_COUNT, HEADER1_COUNT)
initialize_data_file(FILE2_LANGUAGE, HEADER2_LANGUAGE)
initialize_data_file(FILE3_COUNTRY, HEADER3_COUNTRY)
for file_path, header in [
(FILE1_COUNT, HEADER1_COUNT),
(FILE2_LANGUAGE, HEADER2_LANGUAGE),
(FILE3_COUNTRY, HEADER3_COUNTRY),
]:
if not os.path.isfile(file_path):
shared.rows_to_csv(args, file_path, header, [])


def get_last_completed_plan_index():
@@ -150,8 +139,6 @@ def load_plan():


def append_data(args, plan_row, index, count):
if not args.enable_save:
return
if plan_row["COUNTRY"]:
file_path = FILE3_COUNTRY
fieldnames = HEADER3_COUNTRY
Expand All @@ -178,11 +165,7 @@ def append_data(args, plan_row, index, count):
"TOOL_IDENTIFIER": plan_row["TOOL_IDENTIFIER"],
"COUNT": count,
}
with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=fieldnames, dialect="unix"
)
writer.writerow(row)
shared.rows_to_csv(args, file_path, fieldnames, [row], append=True)


def query_gcs(args, service, last_completed_plan_index, plan):
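gcs_fetch.py exercises both modes of the shared helper: a header-only write during initialization and a one-row append after each completed query. A hedged usage fragment (not a standalone program) follows, assuming the sketch above; the row values are invented, and the real append_data row also carries the plan fields shown in the hunk.

# Fragment: how gcs_fetch.py uses the two modes of the assumed helper.
# Initialization: write a header-only CSV (no data rows yet).
shared.rows_to_csv(args, FILE3_COUNTRY, HEADER3_COUNTRY, [])

# After each completed query: append a single result row. Columns missing
# from the dict are left empty by csv.DictWriter.
row = {"TOOL_IDENTIFIER": "CC BY 4.0", "COUNT": 1234}  # hypothetical values
shared.rows_to_csv(args, FILE3_COUNTRY, HEADER3_COUNTRY, [row], append=True)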
32 changes: 7 additions & 25 deletions scripts/1-fetch/github_fetch.py
@@ -28,7 +28,7 @@
LOGGER, PATHS = shared.setup(__file__)

# Constants
FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
FILE_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
GH_TOKEN = os.getenv("GH_TOKEN")
# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
GITHUB_TOOLS = [
Expand All @@ -40,7 +40,7 @@
{"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
{"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
]
HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
HEADER_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
QUARTER = os.path.basename(PATHS["data_quarter"])


@@ -68,7 +68,7 @@ def parse_arguments():

def check_for_completion():
try:
with open(FILE1_COUNT, "r", newline="") as file_obj:
with open(FILE_COUNT, "r", encoding="utf-8") as file_obj:
reader = csv.DictReader(file_obj, dialect="unix")
if len(list(reader)) == len(GITHUB_TOOLS):
raise shared.QuantifyingException(
Expand All @@ -78,27 +78,6 @@ def check_for_completion():
pass # File may not be found without --enable-save, etc.


def write_data(args, tool_data):
if not args.enable_save:
return args

# Create data directory for this phase
os.makedirs(PATHS["data_phase"], exist_ok=True)

if len(tool_data) < len(GITHUB_TOOLS):
LOGGER.error("Unable to fetch all records. Aborting.")
return args

with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
)
writer.writeheader()
for row in tool_data:
writer.writerow(row)
return args


def query_github(args, session):
tool_data = []
for tool in GITHUB_TOOLS:
@@ -148,7 +127,10 @@ def main():
session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})

tool_data = query_github(args, session)
args = write_data(args, tool_data)
if len(tool_data) < len(GITHUB_TOOLS):
LOGGER.error("Unable to fetch all records. Aborting.")
return args
shared.rows_to_csv(args, FILE_COUNT, HEADER_COUNT, tool_data)
args = shared.git_add_and_commit(
args,
PATHS["repo"],
18 changes: 1 addition & 17 deletions scripts/1-fetch/openverse_fetch.py
@@ -13,7 +13,6 @@

# Standard library
import argparse
import csv
import os
import sys
import textwrap
@@ -192,27 +191,12 @@ def query_openverse(session):
return aggregate


def write_data(args, data):
if not args.enable_save:
return
os.makedirs(PATHS["data_phase"], exist_ok=True)
with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj:
writer = csv.DictWriter(
file_obj,
fieldnames=OPENVERSE_FIELDS,
dialect="unix",
)
writer.writeheader()
for row in data:
writer.writerow(row)


def main():
args = parse_arguments()
LOGGER.info("Starting Openverse Fetch Script...")
session = shared.get_session(accept_header="application/json")
records = query_openverse(session)
write_data(args, records)
shared.rows_to_csv(args, FILE_PATH, OPENVERSE_FIELDS, records)
LOGGER.info(f"Fetched {len(records)} unique Openverse records.")


33 changes: 4 additions & 29 deletions scripts/1-fetch/smithsonian_fetch.py
@@ -74,15 +74,15 @@ def check_for_completion():
completed_units = False

try:
with open(FILE_1_METRICS, "r", newline="") as file_obj:
with open(FILE_1_METRICS, "r", encoding="utf-8") as file_obj:
reader = csv.DictReader(file_obj, dialect="unix")
if len(list(reader)) > 0:
completed_metrics = True
except FileNotFoundError:
pass # File may not be found without --enable-save, etc.

try:
with open(FILE_2_UNITS, "r", newline="") as file_obj:
with open(FILE_2_UNITS, "r", encoding="utf-8") as file_obj:
reader = csv.DictReader(file_obj, dialect="unix")
if len(list(reader)) > 30:
completed_units = True
Expand All @@ -95,32 +95,6 @@ def check_for_completion():
)


def write_data(args, data_metrics, data_units):
if not args.enable_save:
return args

# Create data directory for this phase
os.makedirs(PATHS["data_phase"], exist_ok=True)

with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
)
writer.writeheader()
for row in data_metrics:
writer.writerow(row)

with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
)
writer.writeheader()
for row in data_units:
writer.writerow(row)

return args


def query_smithsonian(args, session):
if not DATA_GOV_API_KEY:
raise shared.QuantifyingException(
@@ -177,7 +151,8 @@ def main():
check_for_completion()
session = shared.get_session()
data_metrics, data_units = query_smithsonian(args, session)
args = write_data(args, data_metrics, data_units)
shared.rows_to_csv(args, FILE_1_METRICS, HEADER_1_METRICS, data_metrics)
shared.rows_to_csv(args, FILE_2_UNITS, HEADER_2_UNITS, data_units)
args = shared.git_add_and_commit(
args,
PATHS["repo"],
22 changes: 4 additions & 18 deletions scripts/1-fetch/wikipedia_fetch.py
@@ -65,7 +65,9 @@

def check_for_completion():
try:
with open(FILE_LANGUAGES, "r", newline="") as file_obj:
with open(
FILE_LANGUAGES, "r", encoding="utf-8", newline=""
) as file_obj:
reader = csv.DictReader(file_obj, dialect="unix")
if len(list(reader)) > 300:
raise shared.QuantifyingException(
Expand All @@ -75,22 +77,6 @@ def check_for_completion():
pass # File may not be found without --enable-save, etc.


def write_data(args, tool_data):
if not args.enable_save:
return args
LOGGER.info("Saving fetched data")
os.makedirs(PATHS["data_phase"], exist_ok=True)

with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
writer = csv.DictWriter(
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
)
writer.writeheader()
for row in tool_data:
writer.writerow(row)
return args


def query_wikipedia_languages(session):
LOGGER.info("Fetching article counts from all language Wikipedias")
tool_data = []
@@ -173,7 +159,7 @@ def main():
shared.git_fetch_and_merge(args, PATHS["repo"])
session = shared.get_session()
tool_data = query_wikipedia_languages(session)
args = write_data(args, tool_data)
shared.rows_to_csv(args, FILE_LANGUAGES, HEADER_LANGUAGES, tool_data)
args = shared.git_add_and_commit(
args,
PATHS["repo"],