From 5887092c1b75e386f7b59b1a03f96681f245368d Mon Sep 17 00:00:00 2001 From: eddie-m-m Date: Fri, 15 Nov 2024 16:03:15 -0800 Subject: [PATCH 01/11] Add CKAN, MuckRock, and common_crawler to source_collectors dir --- source_collectors/ckan/README.md | 218 ++++++++++++ .../ckan/ckan_scraper_toolkit.py | 200 +++++++++++ source_collectors/ckan/requirements.txt | 6 + .../ckan/scrape_ckan_data_portals.py | 284 +++++++++++++++ source_collectors/ckan/search_terms.py | 32 ++ source_collectors/common_crawler/README.md | 87 +++++ source_collectors/common_crawler/__init__.py | 0 source_collectors/common_crawler/argparser.py | 74 ++++ source_collectors/common_crawler/cache.py | 91 +++++ source_collectors/common_crawler/config.ini | 19 + source_collectors/common_crawler/crawler.py | 130 +++++++ .../common_crawler/csv_manager.py | 78 +++++ .../common_crawler/data/cache.json | 7 + .../common_crawler/data/urls.csv | 207 +++++++++++ source_collectors/common_crawler/main.py | 328 ++++++++++++++++++ .../requirements_common_crawler_action.txt | 3 + source_collectors/common_crawler/utils.py | 22 ++ source_collectors/muckrock/.gitignore | 228 ++++++++++++ source_collectors/muckrock/README.md | 90 +++++ .../muckrock/allegheny-county-towns.txt | 61 ++++ .../convert_all_record_types_to_csv.py | 26 ++ .../muckrock/create_foia_data_db.py | 270 ++++++++++++++ .../muckrock/download_muckrock_foia.py | 43 +++ .../generate_detailed_muckrock_csv.py | 152 ++++++++ .../muckrock/get_all_record_types.py | 17 + .../muckrock/get_allegheny_foias.py | 74 ++++ source_collectors/muckrock/muck_get.py | 50 +++ .../muckrock/muckrock_ml_labeler.py | 41 +++ source_collectors/muckrock/requirements.txt | 30 ++ .../muckrock/search_foia_data_db.py | 181 ++++++++++ .../muckrock/search_local_foia_json.py | 38 ++ source_collectors/muckrock/utils.py | 27 ++ 32 files changed, 3114 insertions(+) create mode 100644 source_collectors/ckan/README.md create mode 100644 source_collectors/ckan/ckan_scraper_toolkit.py create mode 100644 source_collectors/ckan/requirements.txt create mode 100644 source_collectors/ckan/scrape_ckan_data_portals.py create mode 100644 source_collectors/ckan/search_terms.py create mode 100644 source_collectors/common_crawler/README.md create mode 100644 source_collectors/common_crawler/__init__.py create mode 100644 source_collectors/common_crawler/argparser.py create mode 100644 source_collectors/common_crawler/cache.py create mode 100644 source_collectors/common_crawler/config.ini create mode 100644 source_collectors/common_crawler/crawler.py create mode 100644 source_collectors/common_crawler/csv_manager.py create mode 100644 source_collectors/common_crawler/data/cache.json create mode 100644 source_collectors/common_crawler/data/urls.csv create mode 100644 source_collectors/common_crawler/main.py create mode 100644 source_collectors/common_crawler/requirements_common_crawler_action.txt create mode 100644 source_collectors/common_crawler/utils.py create mode 100644 source_collectors/muckrock/.gitignore create mode 100644 source_collectors/muckrock/README.md create mode 100644 source_collectors/muckrock/allegheny-county-towns.txt create mode 100644 source_collectors/muckrock/convert_all_record_types_to_csv.py create mode 100644 source_collectors/muckrock/create_foia_data_db.py create mode 100644 source_collectors/muckrock/download_muckrock_foia.py create mode 100644 source_collectors/muckrock/generate_detailed_muckrock_csv.py create mode 100644 source_collectors/muckrock/get_all_record_types.py create mode 100644 
source_collectors/muckrock/get_allegheny_foias.py create mode 100644 source_collectors/muckrock/muck_get.py create mode 100644 source_collectors/muckrock/muckrock_ml_labeler.py create mode 100644 source_collectors/muckrock/requirements.txt create mode 100644 source_collectors/muckrock/search_foia_data_db.py create mode 100644 source_collectors/muckrock/search_local_foia_json.py create mode 100644 source_collectors/muckrock/utils.py
diff --git a/source_collectors/ckan/README.md b/source_collectors/ckan/README.md
new file mode 100644
index 00000000..be6c65cf
--- /dev/null
+++ b/source_collectors/ckan/README.md
@@ -0,0 +1,218 @@
+# CKAN Scraper
+
+## Introduction
+
+This scraper can be used to retrieve package information from [CKAN](https://ckan.org/), which hosts open data projects such as . CKAN API documentation can be found at .
+
+Running the scraper searches the configured portals for the configured search terms and outputs the matching packages to a CSV file.
+
+## Definitions
+
+* `Package` - Also called a dataset; a page containing relevant information about a dataset. For example, this page is a package: .
+* `Collection` - A grouping of child packages related to a parent package. This is separate from a group.
+* `Group` - Also called a topic; a grouping of packages. Packages in a group do not have a parent package. Groups can also contain subgroups.
+* `Organization` - The entity that the data in a package belongs to, such as "City of Austin" or "Department of Energy". Organization types are groups of organizations that share something in common with each other.
+
+## Files
+
+* `scrape_ckan_data_portals.py` - The main scraper file. Running this will execute a search across multiple CKAN instances and output the results to a CSV file.
+* `search_terms.py` - The search terms and CKAN portals to search from.
+* `ckan_scraper_toolkit.py` - Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals.
+
+## Setup
+
+1. In a terminal, navigate to the CKAN scraper folder
+ ```cmd
+ cd source_collectors/ckan/
+ ```
+2. Create and activate a Python virtual environment
+ ```cmd
+ python -m venv venv
+ source venv/bin/activate
+ ```
+
+3. Install the requirements
+ ```cmd
+ pip install -r requirements.txt
+ ```
+4. Run the multi-portal CKAN scraper
+ ```cmd
+ python scrape_ckan_data_portals.py
+ ```
+5. Review the generated `results.csv` file.
+
+## How can I tell if a website I want to scrape is hosted using CKAN?
+
+There's no easy way to tell; some websites will reference CKAN or link back to the CKAN documentation while others will not. There also doesn't seem to be a database of all CKAN instances.
+
+The best way to determine whether a data catalog is using CKAN is to attempt to query its API. To do this:
+
+1. In a web browser, navigate to the website's data catalog (e.g. for data.gov this is at )
+2. Copy the first part of the link (e.g. )
+3. Paste it in the browser's URL bar and add `api/3/action/package_search` to the end (e.g. )
+
+*NOTE: Some hosts use a different base URL for API requests. For example, Canada's Open Government Portal can be found at while the API access link is as described in their [Access our API](https://open.canada.ca/en/access-our-application-programming-interface-api) page.*
+
+Another way to tell is by looking at the page layout. Most CKAN instances have a similar layout to one another. You can see an example at and .
Both catalogues have a sidebar on the left with search refinement options, a search box on the top below the page title, and a list of datasets to the right of the sidebar, among other similarities.
+
+## Documentation for ckan_scraper_toolkit.py
+
+### On ckanapi return data
+
+Across CKAN instances, the ckanapi return data is largely the same in terms of layout. The key difference among instances is in the `extras` key, where an instance may define its own custom keys. An example ckanapi return, truncated to save space, is provided below. This is the general layout returned by most of the toolkit's functions:
+
+```json
+{
+ "author": null,
+ "author_email": null,
+ "id": "f468fe8a-a319-464f-9374-f77128ffc9dc",
+ "maintainer": "NYC OpenData",
+ "maintainer_email": "no-reply@data.cityofnewyork.us",
+ "metadata_created": "2020-11-10T17:05:36.995577",
+ "metadata_modified": "2024-10-25T20:28:59.948113",
+ "name": "nypd-arrest-data-year-to-date",
+ "notes": "This is a breakdown of every arrest effected in NYC by the NYPD during the current year.\n This data is manually extracted every quarter and reviewed by the Office of Management Analysis and Planning. \n Each record represents an arrest effected in NYC by the NYPD and includes information about the type of crime, the location and time of enforcement. \nIn addition, information related to suspect demographics is also included. \nThis data can be used by the public to explore the nature of police enforcement activity. \nPlease refer to the attached data footnotes for additional information about this dataset.",
+ "organization": {
+ "id": "1149ee63-2fff-494e-82e5-9aace9d3b3bf",
+ "name": "city-of-new-york",
+ "title": "City of New York",
+ "description": "",
+ ...
+ },
+ "title": "NYPD Arrest Data (Year to Date)",
+ "extras": [
+ {
+ "key": "accessLevel",
+ "value": "public"
+ },
+ {
+ "key": "landingPage",
+ "value": "https://data.cityofnewyork.us/d/uip8-fykc"
+ },
+ {
+ "key": "publisher",
+ "value": "data.cityofnewyork.us"
+ },
+ ...
+ ],
+ "groups": [
+ {
+ "description": "Local Government Topic - for all datasets with state, local, county organizations",
+ "display_name": "Local Government",
+ "id": "7d625e66-9e91-4b47-badd-44ec6f16b62b",
+ "name": "local",
+ "title": "Local Government",
+ ...
+ }
+ ],
+ "resources": [
+ {
+ "created": "2020-11-10T17:05:37.001960",
+ "description": "",
+ "format": "CSV",
+ "id": "c48f1a1a-5efb-4266-9572-769ed1c9b472",
+ "metadata_modified": "2020-11-10T17:05:37.001960",
+ "name": "Comma Separated Values File",
+ "no_real_name": true,
+ "package_id": "f468fe8a-a319-464f-9374-f77128ffc9dc",
+ "url": "https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD",
+ ...
+ },
+ {
+ "created": "2020-11-10T17:05:37.001970",
+ "describedBy": "https://data.cityofnewyork.us/api/views/uip8-fykc/columns.rdf",
+ "describedByType": "application/rdf+xml",
+ "description": "",
+ "format": "RDF",
+ "id": "5c137f71-4e20-49c5-bd45-a562952195fe",
+ "metadata_modified": "2020-11-10T17:05:37.001970",
+ "name": "RDF File",
+ "package_id": "f468fe8a-a319-464f-9374-f77128ffc9dc",
+ "url": "https://data.cityofnewyork.us/api/views/uip8-fykc/rows.rdf?accessType=DOWNLOAD",
+ ...
+ },
+ ...
+ ],
+ "tags": [
+ {
+ "display_name": "arrest",
+ "id": "a76dff3f-cba8-42b4-ab51-1aceb059d16f",
+ "name": "arrest",
+ "state": "active",
+ "vocabulary_id": null
+ },
+ {
+ "display_name": "crime",
+ "id": "df442823-c823-4890-8fca-805427bd8dd9",
+ "name": "crime",
+ "state": "active",
+ "vocabulary_id": null
+ },
+ ...
+ ],
+ "relationships_as_subject": [],
+ "relationships_as_object": [],
+ ...
+}
+```
+
+---
+`ckan_package_search(base_url: str, query: Optional[str], rows: Optional[int], start: Optional[int], **kwargs) -> list[dict[str, Any]]`
+
+Searches for packages (datasets) in a CKAN data portal that satisfy the given search criteria.
+
+### Parameters
+
+* **base_url** - The base URL to search from. e.g. "https://catalog.data.gov/"
+* **query (optional)** - The keyword string to search for. e.g. "police". Leaving this empty will return all packages in the package list. Multi-word searches should be wrapped in double quotes. For example, '"calls for service"' will return packages with the term "calls for service", while 'calls for service' will return packages with either "calls", "for", or "service" as keywords.
+* **rows (optional)** - The maximum number of results to return. Leaving this empty will return all results.
+* **start (optional)** - Which result number to start at. Leaving this empty will start at the first result.
+* **kwargs (optional)** - Additional keyword arguments. For more information on acceptable keyword arguments and their function see
+
+### Return
+
+The function returns a list of dictionaries containing matching package results.
+
+---
+
+`ckan_package_search_from_organization(base_url: str, organization_id: str) -> list[dict[str, Any]]`
+
+Returns a list of CKAN packages from an organization. Due to a CKAN limitation, only 10 packages can be returned.
+
+### Parameters
+
+* **base_url** - The base URL to search from. e.g. "https://catalog.data.gov/"
+* **organization_id** - The ID of the organization. This can be retrieved by searching for a package and finding the "id" key in the "organization" key.
+
+### Return
+
+The function returns a list of dictionaries containing matching package results.
+
+---
+
+`ckan_group_package_show(base_url: str, id: str, limit: Optional[int]) -> list[dict[str, Any]]`
+
+Returns a list of CKAN packages that belong to a particular group.
+
+### Parameters
+
+* **base_url** - The base URL of the CKAN portal. e.g. "https://catalog.data.gov/"
+* **id** - The group's ID. This can be retrieved by searching for a package and finding the "id" key in the "groups" key.
+* **limit** - The maximum number of results to return; leaving this empty will return all results.
+
+### Return
+
+The function returns a list of dictionaries representing the packages associated with the group.
+
+---
+
+`ckan_collection_search(base_url: str, collection_id: str) -> list[Package]`
+
+Returns a list of CKAN packages that belong to a collection. CKAN data portals are supposed to return package relationships along with the rest of the API data, but in practice not all portals are set up this way. Because child packages cannot be queried directly, they will not appear in any search results. To work around this, this function manually scrapes the information of all child packages related to the given parent.
+
+*NOTE: This function has only been tested on . It is likely it will not work properly on other platforms.*
+
+### Parameters
+
+* **base_url** - The base URL of the CKAN portal before the collection ID. e.g.
"https://catalog.data.gov/dataset/" +* **collection_id** - The ID of the parent package. This can be found by querying the parent package and using the "id" key, or by navigating to the list of child packages and looking in the URL. e.g. In the collection_id is "7b1d1941-b255-4596-89a6-99e1a33cc2d8" + +### Return + +List of Package objects representing the child packages associated with the collection. diff --git a/source_collectors/ckan/ckan_scraper_toolkit.py b/source_collectors/ckan/ckan_scraper_toolkit.py new file mode 100644 index 00000000..0d9dc449 --- /dev/null +++ b/source_collectors/ckan/ckan_scraper_toolkit.py @@ -0,0 +1,200 @@ +"""Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals""" +from concurrent.futures import as_completed, ThreadPoolExecutor +from dataclasses import dataclass, field +from datetime import datetime +import math +import sys + +import time +from typing import Any, Optional +from urllib.parse import urljoin + +from bs4 import BeautifulSoup +from ckanapi import RemoteCKAN +import requests + + +@dataclass +class Package: + base_url: str = "" + url: str = "" + title: str = "" + agency_name: str = "" + description: str = "" + supplying_entity: str = "" + record_format: list = field(default_factory=lambda: []) + data_portal_type: str = "" + source_last_updated: str = "" + + def to_dict(self): + return { + "source_url": self.url, + "submitted_name": self.title, + "agency_name": self.agency_name, + "description": self.description, + "supplying_entity": self.supplying_entity, + "record_format": self.record_format, + "data_portal_type": self.data_portal_type, + "source_last_updated": self.source_last_updated, + } + + +def ckan_package_search( + base_url: str, + query: Optional[str] = None, + rows: Optional[int] = sys.maxsize, + start: Optional[int] = 0, + **kwargs, +) -> list[dict[str, Any]]: + """Performs a CKAN package (dataset) search from a CKAN data catalog URL. + + :param base_url: Base URL to search from. e.g. "https://catalog.data.gov/" + :param query: Search string, defaults to None. None will return all packages. + :param rows: Maximum number of results to return, defaults to maximum integer. + :param start: Offsets the results, defaults to 0. + :param kwargs: See https://docs.ckan.org/en/2.10/api/index.html#ckan.logic.action.get.package_search for additional arguments. + :return: List of dictionaries representing the CKAN package search results. + """ + remote = RemoteCKAN(base_url, get_only=True) + results = [] + offset = start + rows_max = 1000 # CKAN's package search has a hard limit of 1000 packages returned at a time by default + + while start < rows: + num_rows = rows - start + offset + packages = remote.action.package_search( + q=query, rows=num_rows, start=start, **kwargs + ) + # Add the base_url to each package + [package.update(base_url=base_url) for package in packages["results"]] + results += packages["results"] + + total_results = packages["count"] + if rows > total_results: + rows = total_results + + result_len = len(packages["results"]) + # Check if the website has a different rows_max value than CKAN's default + if result_len != rows_max and start + rows_max < total_results: + rows_max = result_len + + start += rows_max + + return results + + +def ckan_package_search_from_organization( + base_url: str, organization_id: str +) -> list[dict[str, Any]]: + """Returns a list of CKAN packages from an organization. Only 10 packages are able to be returned. + + :param base_url: Base URL of the CKAN portal. e.g. 
"https://catalog.data.gov/" + :param organization_id: The organization's ID. + :return: List of dictionaries representing the packages associated with the organization. + """ + remote = RemoteCKAN(base_url, get_only=True) + organization = remote.action.organization_show( + id=organization_id, include_datasets=True + ) + packages = organization["packages"] + results = [] + + for package in packages: + query = f"id:{package['id']}" + results += ckan_package_search(base_url=base_url, query=query) + + return results + + +def ckan_group_package_show( + base_url: str, id: str, limit: Optional[int] = sys.maxsize +) -> list[dict[str, Any]]: + """Returns a list of CKAN packages from a group. + + :param base_url: Base URL of the CKAN portal. e.g. "https://catalog.data.gov/" + :param id: The group's ID. + :param limit: Maximum number of results to return, defaults to maximum integer. + :return: List of dictionaries representing the packages associated with the group. + """ + remote = RemoteCKAN(base_url, get_only=True) + results = remote.action.group_package_show(id=id, limit=limit) + # Add the base_url to each package + [package.update(base_url=base_url) for package in results] + return results + + +def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: + """Returns a list of CKAN packages from a collection. + + :param base_url: Base URL of the CKAN portal before the collection ID. e.g. "https://catalog.data.gov/dataset/" + :param collection_id: The ID of the parent package. + :return: List of Package objects representing the packages associated with the collection. + """ + packages = [] + url = f"{base_url}?collection_package_id={collection_id}" + soup = _get_soup(url) + + # Calculate the total number of pages of packages + num_results = int(soup.find(class_="new-results").text.split()[0].replace(",", "")) + pages = math.ceil(num_results / 20) + + for page in range(1, pages + 1): + url = f"{base_url}?collection_package_id={collection_id}&page={page}" + soup = _get_soup(url) + + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit( + _collection_search_get_package_data, dataset_content, base_url + ) + for dataset_content in soup.find_all(class_="dataset-content") + ] + + [ + packages.append(package.result()) + for package in as_completed(futures) + ] + + # Take a break to avoid being timed out + if len(futures) >= 15: + time.sleep(10) + + return packages + + +def _collection_search_get_package_data(dataset_content, base_url: str): + """Parses the dataset content and returns a Package object.""" + package = Package() + joined_url = urljoin(base_url, dataset_content.a.get("href")) + dataset_soup = _get_soup(joined_url) + # Determine if the dataset url should be the linked page to an external site or the current site + resources = dataset_soup.find("section", id="dataset-resources").find_all( + class_="resource-item" + ) + button = resources[0].find(class_="btn-group") + if len(resources) == 1 and button is not None and button.a.text == "Visit page": + package.url = button.a.get("href") + else: + package.url = joined_url + package.data_portal_type = "CKAN" + package.base_url = base_url + package.title = dataset_soup.find(itemprop="name").text.strip() + package.agency_name = dataset_soup.find("h1", class_="heading").text.strip() + package.supplying_entity = dataset_soup.find(property="dct:publisher").text.strip() + package.description = dataset_soup.find(class_="notes").p.text + package.record_format = [ + record_format.text.strip() for 
record_format in dataset_content.find_all("li")
+ ]
+ package.record_format = list(set(package.record_format))
+
+ date = dataset_soup.find(property="dct:modified").text.strip()
+ package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")
+
+ return package
+
+
+def _get_soup(url: str) -> BeautifulSoup:
+ """Returns a BeautifulSoup object for the given URL."""
+ time.sleep(1)
+ response = requests.get(url)
+ return BeautifulSoup(response.content, "lxml")
diff --git a/source_collectors/ckan/requirements.txt b/source_collectors/ckan/requirements.txt
new file mode 100644
index 00000000..fc41154b
--- /dev/null
+++ b/source_collectors/ckan/requirements.txt
@@ -0,0 +1,6 @@
+from_root
+ckanapi
+bs4
+lxml
+tqdm
+pandas
\ No newline at end of file
diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py
new file mode 100644
index 00000000..ef83b4dc
--- /dev/null
+++ b/source_collectors/ckan/scrape_ckan_data_portals.py
@@ -0,0 +1,284 @@
+"""Retrieves packages from CKAN data portals, parses relevant information, then outputs it to a CSV file"""
+from itertools import chain
+import json
+import sys
+from typing import Any, Callable, Optional
+
+from from_root import from_root
+import pandas as pd
+from tqdm import tqdm
+
+p = from_root("CONTRIBUTING.md").parent
+sys.path.insert(1, str(p))
+
+from source_collectors.ckan.ckan_scraper_toolkit import (
+ ckan_package_search,
+ ckan_group_package_show,
+ ckan_collection_search,
+ ckan_package_search_from_organization,
+ Package,
+)
+from search_terms import package_search, group_search, organization_search
+
+
+def perform_search(
+ search_func: Callable,
+ search_terms: list[dict[str, Any]],
+ results: list[dict[str, Any]],
+):
+ """Executes a search function with the given search terms.
+
+ :param search_func: The search function to execute.
+ :param search_terms: The list of urls and search terms.
+ :param results: The list of results.
+ :return: Updated list of results.
+ """
+ key = list(search_terms[0].keys())[1]
+ for search in tqdm(search_terms):
+ results += [search_func(search["url"], item) for item in search[key]]
+
+ return results
+
+
+def get_collection_child_packages(
+ results: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+ """Retrieves the child packages of each collection.
+
+ :param results: List of results.
+ :return: List of results containing child packages.
+ """
+ new_list = []
+
+ for result in tqdm(results):
+ if "extras" in result.keys():
+ collections = [
+ ckan_collection_search(
+ base_url="https://catalog.data.gov/dataset/",
+ collection_id=result["id"],
+ )
+ for extra in result["extras"]
+ if extra["key"] == "collection_metadata"
+ and extra["value"] == "true"
+ and not result["resources"]
+ ]
+
+ if collections:
+ new_list += collections[0]
+ continue
+
+ new_list.append(result)
+
+ return new_list
+
+
+def filter_result(result: dict[str, Any] | Package):
+ """Filters the result based on the defined criteria.
+
+ :param result: The result to filter.
+ :return: True if the result should be included, False otherwise.
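+
+ Illustrative, doctest-style sketch of the intended behavior (`Package` is the
+ dataclass imported from ckan_scraper_toolkit):
+
+     >>> filter_result(Package())
+     True
+     >>> filter_result({"extras": [{"key": "accessLevel", "value": "non-public"}], "resources": []})
+     False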
+ """ + if isinstance(result, Package) or "extras" not in result.keys(): + return True + + for extra in result["extras"]: + # Remove parent packages with no resources + if ( + extra["key"] == "collection_metadata" + and extra["value"] == "true" + and not result["resources"] + ): + return False + # Remove non-public packages + elif extra["key"] == "accessLevel" and extra["value"] == "non-public": + return False + + # Remove packages with no data or landing page + if len(result["resources"]) == 0: + landing_page = next( + (extra for extra in result["extras"] if extra["key"] == "landingPage"), None + ) + if landing_page is None: + return False + + return True + + +def parse_result(result: dict[str, Any] | Package) -> dict[str, Any]: + """Retrieves the important information from the package. + + :param result: The result to parse. + :return: The parsed result as a dictionary. + """ + package = Package() + + if isinstance(result, Package): + package.record_format = get_record_format_list(package) + return package.to_dict() + + package.record_format = get_record_format_list( + package=package, resources=result["resources"] + ) + + package = get_source_url(result, package) + package.title = result["title"] + package.description = result["notes"] + package.agency_name = result["organization"]["title"] + package.supplying_entity = get_supplying_entity(result) + package.source_last_updated = result["metadata_modified"][0:10] + + return package.to_dict() + + +def get_record_format_list( + package: Package, + resources: Optional[list[dict[str, Any]]] = None, +) -> list[str]: + """Retrieves the record formats from the package's resources. + + :param package: The package to retrieve record formats from. + :param resources: The list of resources. + :return: List of record formats. + """ + data_types = [ + "CSV", + "PDF", + "XLS", + "XML", + "JSON", + "Other", + "RDF", + "GIS / Shapefile", + "HTML text", + "DOC / TXT", + "Video / Image", + ] + type_conversion = { + "XLSX": "XLS", + "Microsoft Excel": "XLS", + "KML": "GIS / Shapefile", + "GeoJSON": "GIS / Shapefile", + "application/vnd.geo+json": "GIS / Shapefile", + "ArcGIS GeoServices REST API": "GIS / Shapefile", + "Esri REST": "GIS / Shapefile", + "SHP": "GIS / Shapefile", + "OGC WMS": "GIS / Shapefile", + "QGIS": "GIS / Shapefile", + "gml": "GIS / Shapefile", + "WFS": "GIS / Shapefile", + "WMS": "GIS / Shapefile", + "API": "GIS / Shapefile", + "HTML": "HTML text", + "HTML page": "HTML text", + "": "HTML text", + "TEXT": "DOC / TXT", + "JPEG": "Video / Image", + "Api": "JSON", + "CSV downloads": "CSV", + "csv file": "CSV", + } + + if resources is None: + resources = package.record_format + package.record_format = [] + + for resource in resources: + if isinstance(resource, str): + format = resource + else: + format = resource["format"] + + # Is the format one of our conversion types? + if format in type_conversion.keys(): + format = type_conversion[format] + + # Add the format to the package's record format list if it's not already there and is a valid data type + if format not in package.record_format and format in data_types: + package.record_format.append(format) + + if format not in data_types: + package.record_format.append("Other") + + return package.record_format + + +def get_source_url(result: dict[str, Any], package: Package) -> Package: + """Retrieves the source URL from the package's resources. + + :param result: The result to retrieve source URL from. + :param package: The package to update with the source URL. 
+ :return: The updated package. + """ + # If there is only one resource available and it's a link + if len(result["resources"]) == 1 and package.record_format == ["HTML text"]: + # Use the link to the external page + package.url = result["resources"][0]["url"] + # If there are no resources available + elif len(result["resources"]) == 0: + # Use the dataset's external landing page + package.url = [ + extra["value"] + for extra in result["extras"] + if extra["key"] == "landingPage" + ] + package.record_format = ["HTML text"] + else: + # Use the package's dataset information page + package.url = f"{result['base_url']}dataset/{result['name']}" + package.data_portal_type = "CKAN" + + return package + + +def get_supplying_entity(result: dict[str, Any]) -> str: + """Retrieves the supplying entity from the package's extras. + + :param result: The result to retrieve supplying entity from. + :return: The supplying entity. + """ + if "extras" not in result.keys(): + return result["organization"]["title"] + + for extra in result["extras"]: + if extra["key"] == "publisher": + return extra["value"] + + return result["organization"]["title"] + + +def main(): + results = [] + + print("Gathering results...") + results = perform_search( + search_func=ckan_package_search, + search_terms=package_search, + results=results, + ) + results = perform_search( + search_func=ckan_group_package_show, + search_terms=group_search, + results=results, + ) + results = perform_search( + search_func=ckan_package_search_from_organization, + search_terms=organization_search, + results=results, + ) + + flat_list = list(chain(*results)) + # Deduplicate entries + flat_list = [i for n, i in enumerate(flat_list) if i not in flat_list[n + 1 :]] + print("\nRetrieving collections...") + flat_list = get_collection_child_packages(flat_list) + + filtered_results = list(filter(filter_result, flat_list)) + parsed_results = list(map(parse_result, filtered_results)) + + # Write to CSV + df = pd.DataFrame(parsed_results) + df.to_csv("results.csv") + + +if __name__ == "__main__": + main() diff --git a/source_collectors/ckan/search_terms.py b/source_collectors/ckan/search_terms.py new file mode 100644 index 00000000..7fdbc34e --- /dev/null +++ b/source_collectors/ckan/search_terms.py @@ -0,0 +1,32 @@ +package_search = [ + { + "url": "https://catalog.data.gov/", + "terms": [ + "police", + "crime", + "tags:(court courts court-cases criminal-justice-system law-enforcement law-enforcement-agencies)", + ], + }, + {"url": "https://data.boston.gov/", "terms": ["police"]}, + {"url": "https://open.jacksonms.gov/", "terms": ["tags:police"]}, + {"url": "https://data.milwaukee.gov/", "terms": ["mpd", "wibr"]}, + {"url": "https://data.sanantonio.gov/", "terms": ["sapd"]}, + {"url": "https://data.sanjoseca.gov/", "terms": ["police"]} +] + +group_search = [ + { + "url": "https://data.birminghamal.gov/", + "ids": [ + "3c648d96-0a29-4deb-aa96-150117119a23", + "92654c61-3a7d-484f-a146-257c0f6c55aa", + ], + }, +] + +organization_search = [ + { + "url": "https://data.houstontx.gov/", + "ids": ["d6f4346d-f298-498d-b8dd-a4b95ee0846b"], + }, +] diff --git a/source_collectors/common_crawler/README.md b/source_collectors/common_crawler/README.md new file mode 100644 index 00000000..3701b5d5 --- /dev/null +++ b/source_collectors/common_crawler/README.md @@ -0,0 +1,87 @@ +# Common Crawler + +This module interfaces with the Common Crawl dataset to extract urls. 
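+
+Under the hood, the crawler queries the Common Crawl index server directly (see `crawler.py`). As a rough illustration, fetching page 0 of the `CC-MAIN-2023-50` index for the URL pattern `*.gov` is roughly equivalent to the following sketch (names and parameters mirror `crawler.py`; this snippet is illustrative and not part of the module):
+
+```python
+import requests
+
+# One page of index results; each non-empty line of the response body
+# is a JSON record with a "url" field.
+response = requests.get(
+    "http://index.commoncrawl.org/CC-MAIN-2023-50-index",
+    params={"url": "*.gov", "output": "json", "page": 0},
+)
+records = response.text.strip().split("\n")
+```
+
+The crawler then filters each record's `url` field for the keyword (e.g. `police`) before the results are written to the output CSV.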
+
+## Installation
+
+Python Version Required: 3.11
+
+To install all necessary dependencies, run the following command from the root directory:
+
+```bash
+pip install -r requirements.txt
+```
+
+
+## Usage Example
+
+### Environment Requirements
+
+Please ensure you have a `.env` file located in the root directory (not the `common_crawler` directory)
+which contains the following environment variables:
+
+* HUGGINGFACE_ACCESS_TOKEN = The access token to enable writing to the associated PDAP dataset.
+To obtain your access token, consult user settings at
+and ensure you have write access to .
+* LABEL_STUDIO_ACCESS_TOKEN = The access token for the Label Studio API. This can be
+ obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied.
+* LABEL_STUDIO_PROJECT_ID = The project ID for the Label Studio API. This can be
+ obtained by logging into Label Studio and navigating to the relevant project, where the project ID will be in the URL.
+
+### Instructions
+
+Run the following script from the root directory:
+```bash
+python common_crawler/main.py CC-MAIN-2023-50 '*.gov' police --config common_crawler/config.ini --pages 2
+```
+
+This example will crawl two pages (each typically 15000 records) of the Common Crawl dataset with ID `CC-MAIN-2023-50`
+and search for the term `police` in the URLs of all pages within the `.gov` domain. It will use the default configuration file `config.ini`
+to determine the JSON cache location and the location of the output CSV file.
+
+Note that the cache records the most recent page number that was used for a given combination of Common Crawl ID, URL search term, and keyword.
+If the same command is run again, it will start from the next page.
+If you want to reset the cache, you can use the `--reset-cache` flag.
+
+By default, the output CSV file will be named `urls.csv` and will be located in the `data` directory of the module.
+This CSV file contains both the URL and the parameters used to query it.
+
+### Parameters
+
+- **common_crawl_id**: Required. Specifies the Common Crawl Index to perform the search on.
+- **url**: Required. Specifies the domain URL to query. Wildcard characters such as * can be used to expand the search. Note that the query must be contained within quotes (as in '*.gov') to prevent misinterpretation of wildcards.
+- **search_term**: Required. Specifies the keyword to search for within the URL.
+- **-c or --config**: Optional. Specifies the configuration file to use. The default value is config.ini.
+- **-p or --pages**: Optional. Specifies the number of pages to search. The default value is 1.
+- **--reset-cache**: Optional. If set, it resets the cache before starting the crawl.
+
+### Configuration
+
+Several attributes are currently defined in `config.ini`:
+- **cache_filename**: This is the name of the cache file. The default value is `cache`. The file will be saved with a `.json` extension.
+- **output_filename**: This is the name of the output file. The default value is `urls`. The file will be saved with a `.csv` extension.
+- **data_dir**: This is the directory where the cache and output files will be saved. The default value is `data`.
+- **huggingface_repo_id**: This is the repository ID of the Hugging Face dataset to which URLs will be uploaded.
+
+## Code Structure
+
+The code is structured as follows:
+- **main.py**: This is the main file that is used to run the module.
It contains the logic to parse the command line arguments and call the necessary functions. +- **crawler.py**: This file contains the logic to interface with the Common Crawl dataset and extract urls. +- **cache.py**: This file contains the logic to read and write the cache file. +- **argparser.py**: This file contains the logic to parse the command line and config arguments. +- **csv_manager.py**: This file contains the logic to write the output csv file. +- **utils.py**: This file contains utility functions. +- **config.ini**: This file contains the default configuration values. +- **README.md**: This file contains the documentation for the module. You're reading it right now. Isn't that nifty! + +## Testing + +A suite of unit and integration tests were developed for this module. + +To run the tests, run the following command from this directory: + +```bash +pytest ../tests/test_common_crawler_integration.py +pytest ../tests/test_common_crawler_unit.py +``` \ No newline at end of file diff --git a/source_collectors/common_crawler/__init__.py b/source_collectors/common_crawler/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/common_crawler/argparser.py b/source_collectors/common_crawler/argparser.py new file mode 100644 index 00000000..8cdf5b78 --- /dev/null +++ b/source_collectors/common_crawler/argparser.py @@ -0,0 +1,74 @@ +import argparse +import configparser +import re + +""" +This module contains the argument parser for command line arguments +for the Common Crawler script. +""" + +def valid_common_crawl_id(common_crawl_id: str) -> bool: + """ + Validate the Common Crawl ID format. + The Common Crawl ID should be in the format CC-MAIN-YYYY-WW. + Args: + common_crawl_id: The Common Crawl ID to validate + Returns: + True if the Common Crawl ID is valid, False otherwise + """ + return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + +def parse_args() -> argparse.Namespace: + """ + Parse the command line arguments for the Common Crawler script + as well as the configuration file. + Arguments parsed include: + - The Common Crawl ID + - The URL to query + - The search term + - The number of pages to search + - The configuration file (defaults to config.ini) + - A flag to reset the cache + Returns: The parsed arguments + """ + + parser = argparse.ArgumentParser( + description='Query the Common Crawl dataset and optionally save the results to a file.') + # Add the required arguments + parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') + parser.add_argument('url', type=str, help='The URL to query') + parser.add_argument('keyword', type=str, help='The keyword to search in the url') + # Optional arguments for the number of pages and the output file, and a flag to reset the cache + parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') + parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') + parser.add_argument('--reset-cache', action='store_true', default=False, + help='Reset the cache before starting the crawl') + + args = parser.parse_args() + + # Validate the Common Crawl ID format + if not valid_common_crawl_id(args.common_crawl_id): + parser.error("Invalid Common Crawl ID format. 
Expected format is CC-MAIN-YYYY-WW.") + + # Read the configuration file + config = configparser.ConfigParser() + config.read(args.config) + + # Combine parsed arguments with configuration file defaults + app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) + app_parser.set_defaults(**config['DEFAULT']) + + app_args = app_parser.parse_args() + + # Print arguments + print(f"--Common Crawl ID: {app_args.common_crawl_id}") + print(f"--URL: {app_args.url}") + print(f"--Keyword: {app_args.keyword}") + print(f"--Number of Pages: {app_args.pages}") + print(f"--Configuration File: {app_args.config}") + print(f"--Reset Cache: {app_args.reset_cache}") + print(f"--Output File: {app_args.output_filename}.csv") + print(f"--Cache File: {app_args.cache_filename}.json") + print(f"--Data Directory: {app_args.data_dir}") + + return app_args diff --git a/source_collectors/common_crawler/cache.py b/source_collectors/common_crawler/cache.py new file mode 100644 index 00000000..2a48c0b7 --- /dev/null +++ b/source_collectors/common_crawler/cache.py @@ -0,0 +1,91 @@ +import json + +from util.miscellaneous_functions import get_file_path + +""" +This module contains classes for managing a cache of Common Crawl search results +These classes include: + - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results +""" + +class CommonCrawlerCacheManager: + """ + A class for managing the cache of Common Crawl search results. + This class is responsible for adding, retrieving, and saving cache data. + """ + def __init__(self, file_name: str = "cache", directory=None): + """ + Initializes the CacheStorage object with a file name and directory. + Args: + file_name: the name of the cache file + directory: the directory to store the cache file + """ + self.file_path = get_file_path(f"{file_name}.json", directory) + print(f"Cache file path: {self.file_path}") + self.cache = self.load_or_create_cache() + + def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: + """ + Updates the cache with the last page crawled for a given index, url, and keyword. + Or adds a new cache object if it does not exist. + Args: + index: the index of the common crawl + url: the url to search + keyword: the search term to use + last_page: the last page crawled + Returns: None + """ + if index not in self.cache: + self.cache[index] = {} + if url not in self.cache[index]: + self.cache[index][url] = {} + self.cache[index][url][keyword] = last_page + + + def get(self, index, url, keyword) -> int: + """ + Retrieves a page number from the cache. + Args: + index: the index of the common crawl + url: the url to search + keyword: the search term to use + + Returns: int - the last page crawled + + """ + if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + return self.cache[index][url][keyword] + # The cache object does not exist. Return 0 as the default value. + return 0 + + + def load_or_create_cache(self) -> dict: + """ + Loads the cache from the configured file path. + If the file does not exist, an empty dictionary is returned. + Returns: dict - the cache data + """ + try: + with open(self.file_path, 'r') as file: + return json.load(file) + except FileNotFoundError: + return {} + + + def save_cache(self) -> None: + """ + Converts the cache object into a JSON-serializable format and saves it to the configured file path. 
+ This method ensures the cache is stored in a readable and easily reloadable format, allowing for + persistence of crawl data across sessions. + """ + # Reformat cache data for JSON serialization + with open(self.file_path, 'w') as file: + json.dump(self.cache, file, indent=4) + + + def reset_cache(self) -> None: + """ + Resets the cache to an empty state. + """ + self.cache = {} + print("Cache has been reset.") diff --git a/source_collectors/common_crawler/config.ini b/source_collectors/common_crawler/config.ini new file mode 100644 index 00000000..fc558303 --- /dev/null +++ b/source_collectors/common_crawler/config.ini @@ -0,0 +1,19 @@ +# This configuration file contains default settings for the Common Crawler application. +# Settings can be modified to suit different environments or testing needs. + +[DEFAULT] +# Filename for the cache. Stores which pages have been crawled +# at which combinations of index, url search term, and keyword +# to avoid re-crawling them. +cache_filename = cache + +# Directory where data files (both cache and output) are stored. +# Change as needed for different environments. +# Path is relative from working directory that executes common_crawler/main.py +data_dir = common_crawler/data + +# Filename for the output CSV containing crawled URLs. +output_filename = urls + +# Name of the huggingface repo +huggingface_repo_id = PDAP/unlabeled-urls \ No newline at end of file diff --git a/source_collectors/common_crawler/crawler.py b/source_collectors/common_crawler/crawler.py new file mode 100644 index 00000000..9afba7d8 --- /dev/null +++ b/source_collectors/common_crawler/crawler.py @@ -0,0 +1,130 @@ +import json +import time +from urllib.parse import quote_plus +from http import HTTPStatus + +import requests + +from .utils import URLWithParameters +from dataclasses import dataclass +from collections import namedtuple + +""" +This module contains classes for managing a cache of Common Crawl search results +""" + +# TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? + + + +@dataclass +class CommonCrawlResult: + last_page_search: int + url_results: list[str] + + +class CommonCrawlerManager: + """ + This class orchestrates the crawling process, leveraging CommonCrawler for + actual interactions with the Common Crawl Index Server and CommonCrawlerCacheManager + for caching results. + It validates crawl ids, manages pagination, and aggregates results. 
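+
+ Illustrative usage (a sketch only; the calls below hit the live Common Crawl
+ index server and mirror the methods defined in this class):
+
+     manager = CommonCrawlerManager(crawl_id="CC-MAIN-2023-50")
+     result = manager.crawl(search_term="*.gov", keyword="police", start_page=0, num_pages=1)
+     # result is a CommonCrawlResult with .last_page_search and .url_results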
+ """ + + def __init__(self, crawl_id='CC-MAIN-2023-50'): + self.crawl_id = crawl_id + CC_INDEX_SERVER = 'http://index.commoncrawl.org/' + INDEX_NAME = f'{self.crawl_id}-index' + self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + + def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: + print( + f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," + f" starting at page {start_page}") + + url_results = [] + + end_page = start_page + num_pages + last_page = start_page + + for next_page in range(start_page, end_page): + records = self.search_common_crawl_index(search_term, next_page) + + # If records were found, filter them and add to results + if not records: + continue + + keyword_urls = self.get_urls_with_keyword(records, keyword) + url_results.extend(keyword_urls) + + last_page = next_page + + # Wait 5 seconds before making the next request, to avoid overloading the server + time.sleep(5) + + return CommonCrawlResult(last_page, url_results) + + def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + """ + This method is used to search the Common Crawl index for a given URL and page number + Args: + url: a URL to search for + page: the page number to search + + Returns: A list of records (dictionaries) containing the search results + + """ + encoded_url = quote_plus(url) + search_url = URLWithParameters(self.root_url) + search_url.add_parameter('url', encoded_url) + search_url.add_parameter('output', 'json') + search_url.add_parameter('page', page) + + retries = 0 + delay = 1 + + # put HTTP GET request in re-try loop in case of rate limiting. Once per second is nice enough per common crawl doc. + while retries < max_retries: + response = self.make_request(search_url) + if response: + return self.process_response(response, url, page) + + retries += 1 + print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + time.sleep(delay) + + print(f"Max retries exceeded. Failed to get records for {url} on page {page}.") + return None + + def make_request(self, search_url: str) -> requests.Response: + """ + Makes the HTTP GET request to the given search URL. + Return the response if successful, None if rate-limited. 
+ """ + try: + response = requests.get(str(search_url)) + response.raise_for_status() + return response + except requests.exceptions.RequestException as e: + if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + return None + else: + print(f"Failed to get records: {e}") + return None + + def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + """Processes the HTTP response and returns the parsed records if successful.""" + if response.status_code == HTTPStatus.OK: + records = response.text.strip().split('\n') + print(f"Found {len(records)} records for {url} on page {page}") + return [json.loads(record) for record in records] + elif 'First Page is 0, Last Page is 0' in response.text: + print("No records exist in index matching the url search term") + return None + else: + print(f"Unexpected response: {response.status_code}") + return None + + @staticmethod + def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: + return [record['url'] for record in records if keyword in record['url']] diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py new file mode 100644 index 00000000..69868629 --- /dev/null +++ b/source_collectors/common_crawler/csv_manager.py @@ -0,0 +1,78 @@ +import csv +import os + +from util.miscellaneous_functions import get_file_path + + +class CSVManager: + """ + Manages a CSV file for storing URLs. + Creates the file if it doesn't exist, and provides a method for adding new rows. + """ + + def __init__( + self, + file_name: str, + headers: list[str], + directory=None + ): + self.file_path = get_file_path(f"{file_name}.csv", directory) + self.headers = headers + if not os.path.exists(self.file_path): + self.initialize_file() + + def add_row(self, row_values: list[str] | tuple[str]): + """ + Appends a new row of data to the CSV. + Args: + row_values: list of values to add to the csv, in order of their inclusion in the list + """ + if isinstance(row_values, str): + # Single values must be converted to a list format + row_values = [row_values] + try: + with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(row_values) + except Exception as e: + print(f"An error occurred while trying to write to {self.file_path}: {e}") + + def add_rows(self, results: list[list[str]]) -> None: + """ + Appends multiple rows of data to the CSV as a list of lists of strings. + Args: + results: list[list[str] - a list of lists of strings, each inner list representing a row + Returns: None + """ + for result in results: + self.add_row( + result + ) + print(f"{len(results)} URLs written to {self.file_path}") + + def initialize_file(self): + """ + Initializes the CSV file. + If the file doesn't exist, it creates it with the header row. + """ + # check if file exists + file_exists = os.path.isfile(self.file_path) + + if not file_exists: + with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(self.headers) + else: + # Open and check that headers match + with open(self.file_path, mode='r', encoding='utf-8') as file: + header_row = next(csv.reader(file)) + if header_row != self.headers: + raise ValueError(f"Header row in {self.file_path} does not match expected headers") + print(f"CSV file initialized at {self.file_path}") + + def delete_file(self): + """ + Deletes the CSV file. 
+ """ + os.remove(self.file_path) + print(f"CSV file deleted at {self.file_path}") diff --git a/source_collectors/common_crawler/data/cache.json b/source_collectors/common_crawler/data/cache.json new file mode 100644 index 00000000..e12687ad --- /dev/null +++ b/source_collectors/common_crawler/data/cache.json @@ -0,0 +1,7 @@ +{ + "CC-MAIN-2023-50": { + "*.gov": { + "police": 10 + } + } +} \ No newline at end of file diff --git a/source_collectors/common_crawler/data/urls.csv b/source_collectors/common_crawler/data/urls.csv new file mode 100644 index 00000000..6fc4dc6f --- /dev/null +++ b/source_collectors/common_crawler/data/urls.csv @@ -0,0 +1,207 @@ +Index,Search Term,Keyword,Page,URL +CC-MAIN-2023-50,*.gov,police,2,https://acworth-ga.gov/administering-the-oath-of-office-to-a-newly-promoted-member-of-the-police-department/ +CC-MAIN-2023-50,*.gov,police,2,https://www.ada.gov/policevideo/policebroadbandgallery.htm +CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/franklintonpolice.htm +CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/illinois_state_police.htm +CC-MAIN-2023-50,*.gov,police,2,https://www.adamn.gov/p/other/police-department +CC-MAIN-2023-50,*.gov,police,2,https://www.adamscountypa.gov/police/earpd +CC-MAIN-2023-50,*.gov,police,2,https://www.aftonwyoming.gov/government/police_department/index.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/community_relations.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/community_relations.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/crime_snapshot_statistics.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/crime_snapshot_statistics.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/index.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/index.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/investigative_subdivision.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/investigative_subdivision.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/procedures.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/procedures.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/recruiting/index.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/recruiting/index.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/services_subdivision.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/services_subdivision.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/transparency_hub.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/transparency_hub.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/uniform_subdivision.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/uniform_subdivision.php +CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/zone_command.php +CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/zone_command.php +CC-MAIN-2023-50,*.gov,police,6,https://adeca.alabama.gov/2022/11/14/gov-ivey-announces-grant-to-help-auburn-police-deter-crime/ +CC-MAIN-2023-50,*.gov,police,7,https://governor.alabama.gov/newsroom/2020/02/kimberly-police-officer-nick-orear-flag-memo/ 
+CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/de/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ +CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ +CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/police-jurisdictions/ +CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ru/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ +CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2015-police-jurisdiction-annexations-deannexations-ordinances/ +CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ +CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2023-police-jurisdiction-deannexations-ordinances-and-maps/ +CC-MAIN-2023-50,*.gov,police,8,https://tourism.alabama.gov/tag/world-police-and-fire-games/ +CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/departments/police-department/community_resources_apd.pdf +CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/v/237/departments/police-department/community_resources_apd.pdf +CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/sharedassets/public/alameda/police/policy-manual.pdf +CC-MAIN-2023-50,*.gov,police,8,http://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf +CC-MAIN-2023-50,*.gov,police,8,https://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf +CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf +CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/departments/police/ +CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/news/stories/peace-officers-memorial-day-and-national-police-week/ +CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/public-safety/police/police-blotter/ +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/index.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/jobs/ +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police.shtml +CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/faiiap/police-fire/index.shtml +CC-MAIN-2023-50,*.gov,police,10,https://gov.alaska.gov/a-proclamation-on-honoring-united-states-capitol-police-officers/ +CC-MAIN-2023-50,*.gov,police,10,https://geohub.albanyga.gov/datasets/corrected-police-beat +CC-MAIN-2023-50,*.gov,police,10,https://data.albanyny.gov/browse?tags=police+report +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/contact-the-albany-police-department 
+CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/programs/medication-and-sharps-disposal +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/hr/salary-schedules/police-table +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/apba/scholarship_packet.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/a18_alarm_user_permit_application.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/secondhand_dealer.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/Solicitor_License.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/neighborhood-watch/2013_nw_brochure-update.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/property/propertyinventoryrecord-fillable.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/child_safety_smartcard.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/facebook_smart_card.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/linkedln_smart_card.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/photosharingservices_smartcard.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/smartphone_smartcard.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/twitter_smart_card.pdf +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/ +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police +CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/administration +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/history +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/records-section 
+CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd +CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/contact-apd +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/cold-cases +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis +CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/crime/statistics-crime-analysis +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/property-inventory-record +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/animal-control +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/apba +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/community-police-academy +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/medication-and-sharps-disposal +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out +CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/national-night-out +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-speed-watch +CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/neighborhood-speed-watch +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-watch-program 
+CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safereturn +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/youthacademy +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/qrcode +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/robots.txt +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/child-safety +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/online-social-media-safety-tips +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/protecting-your-business +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safe-exchange-zones +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safety-on-the-road +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/vehicle +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/cadet-program +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/career-opportunities +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/lateral-officers +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program +CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-02-22/alexandria-police-department-makes-arrest-in-connection-to-shots-fired-incident +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-03-15/alexandria-police-department-apprehends-assault-suspect +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-22/alexandria-police-officer-arrested +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-25/alexandria-police-department-investigates-first-homicide-of-the-year +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-04-18/don-hayes-appointed-alexandria-police-chief 
+CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-06-06/alexandria-police-makes-arrest-in-fatal-shooting +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-08-29/alexandria-police-department-investigates-serious-crash +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-12-21/alexandria-police-department-investigates-shooting-incident +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-09-29/apd-lt-graduates-from-dc-police-leadership-academy +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/ +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/community-police-academy +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/criminal-investigation-division +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/listing-page/apd-news-releases +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/office-of-the-police-chief +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/other-services +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/police-services +CC-MAIN-2023-50,*.gov,police,11,http://www3.alexandriava.gov/police/crime_reports/reporter.php +CC-MAIN-2023-50,*.gov,police,11,https://www3.alexandriava.gov/police/crime_reports/reporter.php +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=112991 +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/default.aspx?id=24274 +CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=59358 +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=27648 +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=33624 +CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=68136 +CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-3030.aspx +CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-4122.aspx +CC-MAIN-2023-50,*.gov,police,11,https://aliquippapa.gov/events/light-up-night-at-the-aliquippa-police-station/ +CC-MAIN-2023-50,*.gov,police,11,https://www.almaarkansas.gov/police/ +CC-MAIN-2023-50,*.gov,police,11,https://www.almontmichigan.gov/departments/police-department/ +CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/contact-forms/departments/police/report-an-abandoned-vehicle-on-public-streets +CC-MAIN-2023-50,*.gov,police,11,https://www.altoonapa.gov/contacts/police/commander-of-criminal-investigation/lt-ashley-day +CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/animal-control +CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/directory +CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/services +CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-documents/ +CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-staff/ 
+CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/how-do-i-file-a-police-report-2/ +CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/who-do-i-call-about-police-related-non-emergencies-2/ +CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/topics/police-courts/ +CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt +CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt +CC-MAIN-2023-50,*.gov,police,11,https://share.america.gov/ar/heres-police-held-accountable-shooting-incidents-video/ diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py new file mode 100644 index 00000000..ae27f556 --- /dev/null +++ b/source_collectors/common_crawler/main.py @@ -0,0 +1,328 @@ +import argparse +import collections +import dataclasses +import re +import sys +import os +from datetime import datetime + +from dotenv import load_dotenv + +# The below code sets the working directory to be the root of the entire repository +# This is done to solve otherwise quite annoying import issues. +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from util.huggingface_api_manager import HuggingFaceAPIManager +from util.miscellaneous_functions import get_filename_friendly_timestamp +from common_crawler.argparser import parse_args +from common_crawler.cache import CommonCrawlerCacheManager +from common_crawler.crawler import CommonCrawlerManager, CommonCrawlResult +from common_crawler.csv_manager import CSVManager +from label_studio_interface.LabelStudioConfig import LabelStudioConfig +from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager + +""" +This module contains the main function for the Common Crawler script. +""" + + +@dataclasses.dataclass +class BatchInfo: + datetime: str + source: str + count: str + keywords: str + notes: str + filename: str + +class LabelStudioError(Exception): + """Custom exception for Label Studio Errors""" + pass + +BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +def get_current_time(): + return str(datetime.now()) + + +def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: + batch_info = BatchInfo( + datetime=get_current_time(), + source="Common Crawl", + count=str(len(common_crawl_result.url_results)), + keywords=f"{args.url} - {args.keyword}", + notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + ) + + batch_info_csv_manager = CSVManager( + file_name='batch_info', + directory=args.data_dir, + headers=BATCH_HEADERS + ) + batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) + + return batch_info + + +def main(): + # Parse the arguments + args = parse_args() + + # Initialize the Cache + cache_manager = CommonCrawlerCacheManager( + file_name=args.cache_filename, + directory=args.data_dir + ) + + load_dotenv() + + # Initialize the HuggingFace API Manager + hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") + if not hf_access_token: + raise ValueError( + "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " + "Please obtain access token from your personal account at " + "https://huggingface.co/settings/tokens and ensure you have write access to " + "https://huggingface.co/PDAP. 
Then include in .env file in root directory.") + huggingface_api_manager = HuggingFaceAPIManager( + access_token=hf_access_token, + repo_id=args.huggingface_repo_id + ) + ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") + if not ls_access_token: + raise ValueError( + "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " + "Please obtain access token from your personal account at " + "https://app.heartex.com/user/account and ensure you have read access to " + "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") + if not ls_project_id: + raise ValueError( + "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " + "Please obtain a project ID by navigating to the Label Studio project " + "where it will be visibile in the url. Then include in .env file in root directory.") + + try: + print("Retrieving Label Studio data for deduplication") + label_studio_results = get_ls_data() + if label_studio_results is None: + raise LabelStudioError("Failed to retrieve Label Studio Data") + print("Label Studio data retrieved successfully") + except LabelStudioError as e: + print(e) + raise + + if args.reset_cache: + cache_manager.reset_cache() + + try: + # Retrieve the last page from the cache, or 0 if it does not exist + last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) + common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + except ValueError as e: + print(f"Error during crawling: {e}") + return + + try: + cache_manager.upsert( + index=args.common_crawl_id, + url=args.url, + keyword=args.keyword, + last_page=common_crawl_result.last_page_search) + cache_manager.save_cache() + + except ValueError as e: + print(f"Error while saving cache manager: {e}") + +def handle_remote_results_error(remote_results): + """ + Handles errors in the remote results + + Args: remote_results (dict): The results from the label studio project + Raises: LabelStudioError: If an error is found in the remote results + """ + + status_code = remote_results.get("status_code") + if status_code == 401: + raise LabelStudioError("Invalid Label Studio token passed! Exiting...") + elif status_code == 404: + raise LabelStudioError("Invalid Label Studio Project ID! Exiting...") + else: + raise LabelStudioError(f"Unexpected error: {remote_results}") + +def validate_remote_results(remote_results): + """ + Validates the remote results retrieved from the Label Studio project + + Args: remote_results (dict or list): The results from the Label Studio project + + Returns: + list[dict]: If the remote results are valid + None: If the remote results are invalid + """ + if isinstance(remote_results, list): + if not remote_results: + print("No data in Label Studio project.") + return [] + elif "url" not in remote_results[0]["data"]: + raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + else: + return remote_results + elif isinstance(remote_results, dict): + handle_remote_results_error(remote_results) + else: + raise LabelStudioError("Unexpected response type.") + +def get_ls_data() -> list[dict] | None: + """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. + + Returns: + list[dict] | None: Data from the Labels Studio project or None if the result is invalid. 
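+
+    Illustrative usage (a sketch; `url_results` stands for URLs returned by a prior crawl,
+    and valid Label Studio credentials are assumed):
+        label_studio_data = get_ls_data()
+        if label_studio_data is not None:
+            url_results = remove_remote_duplicates(url_results, label_studio_data)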
+ """ + # Retrieve the data from the Labels Studio project + config = LabelStudioConfig() + api_manager = LabelStudioAPIManager(config) + response = api_manager.export_tasks_from_project(all_tasks=True) + remote_results = response.json() + + return validate_remote_results(remote_results) + + +def strip_url(url: str) -> str: + """Strips http(s)://www. from the beginning of a url if applicable. + + Args: + url (str): The URL to strip. + + Returns: + str: The stripped URL. + """ + result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) + return result + + +def remove_local_duplicates(url_results: list[str]) -> list[str]: + """Removes duplicate URLs from a list, ignoring http(s)://www. + + Args: + url_results (list[str]): List of URLs to deduplicate. + + Returns: + list[str]: List of unique URLs. + """ + stripped_url_results = [strip_url(url) for url in url_results] + unique_urls = collections.deque() + adjust = 0 + + for index, url in enumerate(stripped_url_results): + if url in unique_urls: + del url_results[index - adjust] + adjust += 1 + else: + unique_urls.appendleft(url) + + return url_results + + +def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: + """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. + + Args: + url_results (list[str]): List of URLs to deduplicate. + label_studio_data (list[dict]): Label Studio project data to check for duplicates. + + Returns: + list[str]: List of remaining URLs not present in the Label Studio project. + """ + try: + remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] + except TypeError: + print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + return url_results + remote_urls = set(remote_urls) + + stripped_url_results = [strip_url(url) for url in url_results] + adjust = 0 + + for index, url in enumerate(stripped_url_results): + if url in remote_urls: + del url_results[index - adjust] + adjust += 1 + + return url_results + + +def handle_csv_and_upload( + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int): + """ + Handles the CSV file and uploads it to Hugging Face repository. + Args: + common_crawl_result: The result from Common Crawl. + huggingface_api_manager: The Hugging Face API manager. + args: The command-line arguments. 
+ last_page: last page crawled + + """ + batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) + + csv_manager = CSVManager( + file_name=batch_info.filename, + headers=['url'], + directory=args.data_dir + ) + csv_manager.add_rows(common_crawl_result.url_results) + huggingface_api_manager.upload_file( + local_file_path=csv_manager.file_path, + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + ) + print( + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + csv_manager.delete_file() + + +def process_crawl_and_upload( + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict]) -> CommonCrawlResult: + # Initialize the CommonCrawlerManager + crawler_manager = CommonCrawlerManager( + args.common_crawl_id + ) + # Determine the pages to search, based on the last page searched + start_page = last_page + 1 + # Use the parsed arguments + common_crawl_result: CommonCrawlResult = crawler_manager.crawl( + search_term=args.url, + keyword=args.keyword, + num_pages=args.pages, + start_page=start_page + ) + # Logic should conclude here if no results are found + if not common_crawl_result.url_results: + print("No url results found. Ceasing main execution.") + add_batch_info_to_csv(common_crawl_result, args, last_page) + return common_crawl_result + + print("Removing urls already in the database") + common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) + common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + if not common_crawl_result.url_results: + print("No urls not already present in the database found. Ceasing main execution.") + add_batch_info_to_csv(common_crawl_result, args, last_page) + return common_crawl_result + + handle_csv_and_upload(common_crawl_result, huggingface_api_manager, args, last_page) + + return common_crawl_result + + +if __name__ == "__main__": + # Example usage: python main.py CC-MAIN-2023-50 *.gov "police" + # Usage with optional arguments: python main.py CC-MAIN-2023-50 *.gov "police" -p 2 -o police_urls.txt + print("Running Common Crawler...") + main() diff --git a/source_collectors/common_crawler/requirements_common_crawler_action.txt b/source_collectors/common_crawler/requirements_common_crawler_action.txt new file mode 100644 index 00000000..22823fd0 --- /dev/null +++ b/source_collectors/common_crawler/requirements_common_crawler_action.txt @@ -0,0 +1,3 @@ +requests~=2.31.0 +python-dotenv~=1.0.1 +huggingface-hub~=0.22.2 \ No newline at end of file diff --git a/source_collectors/common_crawler/utils.py b/source_collectors/common_crawler/utils.py new file mode 100644 index 00000000..0848b023 --- /dev/null +++ b/source_collectors/common_crawler/utils.py @@ -0,0 +1,22 @@ +""" +This module contains utility functions for the common_crawler package +""" + + +class URLWithParameters: + """ + A class to handle URLs with parameters, allowing for easy addition of parameters + """ + + def __init__(self, url): + self.url = url + + def add_parameter(self, parameter, value): + if '?' 
in self.url: + self.url += f"&{parameter}={value}" + else: + self.url += f"?{parameter}={value}" + return self.url + + def __str__(self): + return self.url diff --git a/source_collectors/muckrock/.gitignore b/source_collectors/muckrock/.gitignore new file mode 100644 index 00000000..9ed70e69 --- /dev/null +++ b/source_collectors/muckrock/.gitignore @@ -0,0 +1,228 @@ +# Project specific +/Counties/Florida/Bay County/Scraper/attachments/* +/Counties/Florida/Bay County/Scraper/captcha/correct/* +/Counties/Florida/Bay County/Scraper/captcha/incorrect/* +/scrapers_library/CA/san_bernardino_county/data + +# Ignore dolt repos (cloned from ETL) +**/datasets +**/data-intake + +# Python gitignore from: https://github.com/github/gitignore/blob/master/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# Vim temp files +## swap +[._]*.s[a-v][a-z] +[._]*.sw[a-p] +[._]s[a-v][a-z] +[._]sw[a-p] +## session +Session.vim +## temporary +.netrwhist +*~ + +# OS generated files +.DS_Store +.DS_Store? 
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# IDE generated files
+.idea
+
+# Emacs temp files
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+
+## Org-mode
+.org-id-locations
+*_archive
+!incident_blotter_archive/
+
+## flymake-mode
+*_flymake.*
+
+## eshell files
+/eshell/history
+/eshell/lastdir
+
+## elpa packages
+/elpa/
+
+## reftex files
+*.rel
+
+## AUCTeX auto folder
+/auto/
+
+## cask packages
+.cask/
+dist/
+
+## Flycheck
+flycheck_*.el
+
+## server auth directory
+/server/
+
+## projectiles files
+.projectile
+
+## directory configuration
+.dir-locals.el
+
+.vscode
+/.vscode
+
+*.db
+*.json
+*.csv
+last_page_fetched.txt
diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md
new file mode 100644
index 00000000..d74b77f0
--- /dev/null
+++ b/source_collectors/muckrock/README.md
@@ -0,0 +1,90 @@
+# MuckRock Toolkit
+
+## Description
+
+This repo provides tools for searching MuckRock FOIA requests. It includes scripts for downloading data from MuckRock, generating CSV files per PDAP database requirements, and automatically labeling record types.
+
+## Installation
+
+### 1. Clone the `scrapers` repository and navigate to the `muckrock_tools` directory.
+
+```
+git clone git@github.com:Police-Data-Accessibility-Project/scrapers.git
+cd scrapers/scrapers_library/data_portals/muckrock/muckrock_tools
+```
+
+### 2. Create a virtual environment.
+
+If you don't already have virtualenv, install the package:
+
+```
+
+pip install virtualenv
+
+```
+
+Then run the following command to create a virtual environment (ensure the Python version matches the one below):
+
+```
+
+virtualenv -p python3.12 venv
+
+```
+
+### 3. Activate the virtual environment.
+
+```
+
+source venv/bin/activate
+
+```
+
+### 4. Install dependencies.
+
+```
+
+pip install -r requirements.txt
+
+```
+
+## Uses
+
+### 1. Simple Search Term
+
+- `muck_get.py`
+- Script to perform searches on MuckRock's database by matching a search string against request titles. Searching is slow due to rate limiting (multithreading cannot work around it).
+
+### 2. Clone MuckRock database & search locally
+
+~~- `download_muckrock_foia.py` `search_local_foia_json.py`~~ (deprecated)
+
+- Scripts to clone the MuckRock FOIA request collection for fast local querying (total size <2 GB at present).
+
+- `create_foia_data_db.py` creates and populates a SQLite database (`foia_data.db`) with all MuckRock FOIA requests. Various errors outside the scope of this script may occur; a counter (`last_page_fetched.txt`) is created to keep track of the most recent page fetched and inserted into the database. If the program exits prematurely, simply run `create_foia_data_db.py` again to continue where you left off. A log file is created to capture errors for later reference.
+
+- After `foia_data.db` is created, run `search_foia_data_db.py`, which takes a search string as input and outputs a JSON file of all matching FOIA requests for later processing by `generate_detailed_muckrock_csv.py`. For example,
+
+```
+python3 create_foia_data_db.py
+
+python3 search_foia_data_db.py --search_for "use of force"
+```
+
+produces `use_of_force.json`.
+
+### 3. County Level Search
+
+- `get_allegheny_foias.py`, `allegheny-county-towns.txt`
+- To search for any and all requests in a certain county (e.g. Allegheny in this case), you must provide a list of all municipalities contained within the county. MuckRock stores geographic info in tiers: federal, state, and local. At the local level, e.g.
Pittsburgh and Allegheny County occupy the same tier, so there is no way to determine which municipalities reside within a county without supplying that list yourself.
+
+The `get_allegheny_foias.py` script will find the jurisdiction ID for each municipality in `allegheny-county-towns.txt`, then find all completed FOIA requests for those jurisdictions.
+
+### 4. Generate detailed FOIA data in PDAP database format
+
+- `generate_detailed_muckrock_csv.py`
+- Once you have a JSON file of relevant FOIA requests, run it through this script to generate a CSV that fulfills PDAP database requirements.
+
+### 5. ML Labeling
+
+- `muckrock_ml_labeler.py`
+- A tool for auto-labeling MuckRock sources. This script uses [fine-url-classifier](https://huggingface.co/PDAP/fine-url-classifier) to assign 1 of 36 record type labels. At present, the script expects each source to have associated header tags, provided via `html-tag-collector/collector.py`. (TODO: For MuckRock sources, `collector.py` is insufficient; it does not grab the main text of the request.)
diff --git a/source_collectors/muckrock/allegheny-county-towns.txt b/source_collectors/muckrock/allegheny-county-towns.txt
new file mode 100644
index 00000000..4588e164
--- /dev/null
+++ b/source_collectors/muckrock/allegheny-county-towns.txt
@@ -0,0 +1,61 @@
+Allegheny County
+Allison Park
+Bairdford
+Bakerstown
+Bethel Park
+Brackenridge
+Braddock
+Bradfordwoods
+Bridgeville
+Buena Vista
+Bunola
+Carnegie
+Cheswick
+Clairton
+Coraopolis
+Coulters
+Creighton
+Crescent
+Cuddy
+Curtisville
+Dravosburg
+Duquesne
+East McKeesport
+East Pittsburgh
+Elizabeth
+Gibsonia
+Glassport
+Glenshaw
+Greenock
+Harwick
+Homestead
+Imperial
+Indianola
+Ingomar
+Leetsdale
+McKees Rocks
+Mckeesport
+Monroeville
+Morgan
+Natrona Heights
+North Versailles
+Oakdale
+Oakmont
+Pitcairn
+Pittsburgh
+Presto
+Rural Ridge
+Russellton
+Sewickley
+South Park
+Springdale
+Sturgeon
+Tarentum
+Turtle Creek
+Verona
+Warrendale
+West Elizabeth
+West Mifflin
+Wexford
+Wildwood
+Wilmerding
diff --git a/source_collectors/muckrock/convert_all_record_types_to_csv.py b/source_collectors/muckrock/convert_all_record_types_to_csv.py
new file mode 100644
index 00000000..be6d5364
--- /dev/null
+++ b/source_collectors/muckrock/convert_all_record_types_to_csv.py
@@ -0,0 +1,26 @@
+import subprocess
+import os
+
+record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings',
+                'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports',
+                'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports',
+                'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports',
+                'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports',
+                'court cases', 'incarceration records']
+
+print(len(record_types))
+# json_files = []
+
+
+# for record_type in record_types:
+#     json_file = record_type.replace(' ', '_') + '.json'
+#     json_files.append(json_file)
+
+# for json_file in json_files:
+#     command = ['python', 'generate_detailed_muckrock_csv.py',
+#                '--json_file', json_file]
+
+#     try:
+#         subprocess.run(command, check=True)
+#     except subprocess.CalledProcessError as e:
+#         print(f'An error occurred while processing "{json_file}": {e}')
diff --git
a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py new file mode 100644 index 00000000..44801055 --- /dev/null +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -0,0 +1,270 @@ +''' +create_foia_data_db.py + +This script fetches data from the MuckRock FOIA API and stores it in a SQLite database. +Run this prior to companion script `search_foia_data_db.py`. + +A successful run will output a SQLite database `foia_data.db` with one table `results`. +The database will contain all FOIA requests available through MuckRock. + +Functions: + - create_db() + - fetch_page() + - transform_page_data() + - populate_db() + - main() + +Error Handling: +Errors encountered during API requests or database operations are logged to an `errors.log` file +and/or printed to the console. +''' + + +import requests +import sqlite3 +import logging +import os +import json +import time +from typing import List, Tuple, Dict, Any, Union, Literal + +logging.basicConfig(filename='errors.log', level=logging.ERROR, + format='%(levelname)s: %(message)s') + + +base_url = 'https://www.muckrock.com/api_v1/foia/' +last_page_fetched = 'last_page_fetched.txt' + +NO_MORE_DATA = -1 # flag for program exit +JSON = Dict[str, Any] # type alias + + +create_table_query = ''' + CREATE TABLE IF NOT EXISTS results ( + id INTEGER PRIMARY KEY, + title TEXT, + slug TEXT, + status TEXT, + embargo_status TEXT, + user INTEGER, + username TEXT, + agency INTEGER, + datetime_submitted TEXT, + date_due TEXT, + days_until_due INTEGER, + date_followup TEXT, + datetime_done TEXT, + datetime_updated TEXT, + date_embargo TEXT, + tracking_id TEXT, + price TEXT, + disable_autofollowups BOOLEAN, + tags TEXT, + communications TEXT, + absolute_url TEXT + ) + ''' + + +foia_insert_query = ''' + INSERT INTO results (id, title, slug, status, embargo_status, user, username, agency, + datetime_submitted, date_due, days_until_due, date_followup, + datetime_done, datetime_updated, date_embargo, tracking_id, + price, disable_autofollowups, tags, communications, absolute_url) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''' + + +def create_db() -> bool: + ''' + Creates foia_data.db SQLite database with one table named `results`. + + Returns: + bool: True, if database is successfully created; False otherwise. + + Raises: + sqlite3.Error: If the table creation operation fails, prints error and returns False. + ''' + + try: + with sqlite3.connect('foia_data.db') as conn: + conn.execute(create_table_query) + conn.commit() + print('Successfully created foia_data.db!') + return True + except sqlite3.Error as e: + print(f'SQLite error: {e}.') + logging.error( + f'Failed to create foia_data.db due to SQLite error: {e}') + return False + + +def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: + ''' + Fetches a page of 100 results from the MuckRock FOIA API. + + Args: + page (int): The page number to fetch from the API. + + Returns: + Union[JSON, None, Literal[NO_MORE_DATA]]: + - JSON Dict[str, Any]: The response's JSON data, if the request is successful. + - NO_MORE_DATA (int = -1): A constant, if there are no more pages to fetch (indicated by a 404 response). + - None: If there is an error other than 404. 
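+
+    Illustrative usage (a sketch, assuming the MuckRock API is reachable):
+        page_data = fetch_page(1)
+        if page_data not in (NO_MORE_DATA, None):
+            rows = transform_page_data(page_data)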
+ ''' + + per_page = 100 + response = requests.get( + base_url, params={'page': page, 'page_size': per_page, 'format': 'json'}) + + if response.status_code == 200: + return response.json() + elif response.status_code == 404: + print('No more pages to fetch') + return NO_MORE_DATA # Typically 404 response will mean there are no more pages to fetch + elif 500 <= response.status_code < 600: + logging.error(f'Server error {response.status_code} on page {page}') + page = page + 1 + return fetch_page(page) + else: + print(f'Error fetching page {page}: {response.status_code}') + logging.error(f'Fetching page {page} failed with response code: { + response.status_code}') + return None + + +def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: + ''' + Transforms the data recieved from the MuckRock FOIA API into a structured format for insertion into a database with `populate_db()`. + + Transforms JSON input into a list of tuples, as well as serializes the nested `tags` and `communications` fields into JSON strings. + + Args: + data_to_transform (JSON: Dict[str, Any]): The JSON data from the API response. + + Returns: + transformed_data (List[Tuple[Any, ...]]: A list of tuples, where each tuple contains the fields of a single FOIA request. + ''' + + transformed_data = [] + + for result in data_to_transform.get('results', []): + result['tags'] = json.dumps(result.get('tags', [])) + result['communications'] = json.dumps( + result.get('communications', [])) + + transformed_data.append(( + result['id'], + result['title'], + result['slug'], + result['status'], + result['embargo_status'], + result['user'], + result['username'], + result['agency'], + result['datetime_submitted'], + result['date_due'], + result['days_until_due'], + result['date_followup'], + result['datetime_done'], + result['datetime_updated'], + result['date_embargo'], + result['tracking_id'], + result['price'], + result['disable_autofollowups'], + result['tags'], + result['communications'], + result['absolute_url'] + )) + return transformed_data + + +def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: + ''' + Populates foia_data.db SQLite database with the transfomed FOIA request data. + + Args: + transformed_data (List[Tuple[Any, ...]]): A list of tuples, where each tuple contains the fields of a single FOIA request. + page (int): The current page number for printing and logging errors. + + Returns: + None + + Raises: + sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are + exhausted, logs error and exits. + ''' + + with sqlite3.connect('foia_data.db') as conn: + + retries = 0 + max_retries = 2 + while retries < max_retries: + try: + conn.executemany(foia_insert_query, transformed_data) + conn.commit() + print('Successfully inserted data!') + return + except sqlite3.Error as e: + print(f'SQLite error: {e}. Retrying...') + conn.rollback() + retries += 1 + time.sleep(1) + + if retries == max_retries: + print(f'Failed to insert data from page {page} after { + max_retries} attempts. Skipping to next page.') + logging.error(f'Failed to insert data from page {page} after { + max_retries} attempts.') + + +def main() -> None: + ''' + Main entry point for create_foia_data_db.py. + + This function orchestrates the process of fetching FOIA requests data from the MuckRock FOIA API, transforming it, + and storing it in a SQLite database. 
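+
+    Illustrative usage (re-running resumes from the page recorded in `last_page_fetched.txt`):
+        $ python3 create_foia_data_db.py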
+ ''' + + if not os.path.exists('foia_data.db'): + print('Creating foia_data.db...') + success = create_db() + if success == False: + print('Failed to create foia_data.db') + return + + if os.path.exists(last_page_fetched): + with open(last_page_fetched, mode='r') as file: + page = int(file.read()) + 1 + else: + page = 1 + + while True: + + print(f'Fetching page {page}...') + page_data = fetch_page(page) + + if page_data == NO_MORE_DATA: + break # Exit program because no more data exixts + if page_data is None: + print(f'Skipping page {page}...') + page += 1 + continue + + transformed_data = transform_page_data(page_data) + + populate_db(transformed_data, page) + + with open(last_page_fetched, mode='w') as file: + file.write(str(page)) + page += 1 + + print('create_foia_data_db.py run finished') + + +if __name__ == '__main__': + try: + main() + except Exception as e: + logging.error(f'An unexpected error occurred: {e}') + print('Check errors.log to review errors. Run create_foia_data_db.py again to continue') diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py new file mode 100644 index 00000000..c1a0380f --- /dev/null +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -0,0 +1,43 @@ +import requests +import csv +import time +import json + +# Define the base API endpoint +base_url = "https://www.muckrock.com/api_v1/foia/" + +# Set initial parameters +page = 1 +per_page = 100 +all_data = [] +output_file = "foia_data.json" + +# Function to fetch data from a specific page +def fetch_page(page): + response = requests.get(base_url, params={"page": page, "page_size": per_page, "format": "json"}) + if response.status_code == 200: + return response.json() + else: + print(f"Error fetching page {page}: {response.status_code}") + return None + +# Fetch and store data from all pages +while True: + print(f"Fetching page {page}...") + data = fetch_page(page) + if data is None: + print(f"Skipping page {page}...") + page += 1 + continue + + all_data.extend(data['results']) + if not data['next']: + break + + page += 1 + +# Write data to CSV +with open(output_file, mode='w', encoding='utf-8') as json_file: + json.dump(all_data, json_file, indent=4) + +print(f"Data written to {output_file}") diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py new file mode 100644 index 00000000..4d57737d --- /dev/null +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -0,0 +1,152 @@ +import json +import argparse +import csv +import requests +import time +from utils import format_filename_json_to_csv + +# Load the JSON data +parser = argparse.ArgumentParser(description="Parse JSON from a file.") +parser.add_argument('--json_file', type=str, required=True, + help="Path to the JSON file") + +args = parser.parse_args() + +with open(args.json_file, 'r') as f: + json_data = json.load(f) + +# Define the CSV headers +headers = [ + "name", "agency_described", "record_type", "description", "source_url", + "readme_url", "scraper_url", "state", "county", "municipality", + "agency_type", "jurisdiction_type", "View Archive", "agency_aggregation", + "agency_supplied", "supplying_entity", "agency_originated", "originating_agency", + "coverage_start", "source_last_updated", "coverage_end", "number_of_records_available", + "size", "access_type", "data_portal_type", "access_notes", "record_format", "update_frequency", + "update_method", "retention_schedule", 
"detail_level" +] + + +def get_agency(agency_id): + # API call to get agency_described + if agency_id: + agency_url = f"https://www.muckrock.com/api_v1/agency/{agency_id}/" + response = requests.get(agency_url) + + if response.status_code == 200: + agency_data = response.json() + return agency_data + else: + return "" + else: + print("Agency ID not found in item") + + +def get_jurisdiction(jurisdiction_id): + if jurisdiction_id: + jurisdiction_url = f"https://www.muckrock.com/api_v1/jurisdiction/{ + jurisdiction_id}/" + response = requests.get(jurisdiction_url) + + if response.status_code == 200: + jurisdiction_data = response.json() + return jurisdiction_data + else: + return "" + else: + print("Jurisdiction ID not found in item") + + +output_csv = format_filename_json_to_csv(args.json_file) +# Open a CSV file for writing +with open(output_csv, 'w', newline='') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=headers) + + # Write the header row + writer.writeheader() + + # Iterate through the JSON data + for item in json_data: + print(f"Writing data for {item.get('title')}") + agency_data = get_agency(item.get("agency")) + time.sleep(1) + jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) + + jurisdiction_level = jurisdiction_data.get("level") + # federal jurisduction level + if jurisdiction_level == "f": + state = "" + county = "" + municipality = "" + juris_type = "federal" + # state jurisdiction level + if jurisdiction_level == "s": + state = jurisdiction_data.get("name") + county = "" + municipality = "" + juris_type = "state" + # local jurisdiction level + if jurisdiction_level == "l": + parent_juris_data = get_jurisdiction( + jurisdiction_data.get("parent")) + state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + county = jurisdiction_data.get("name") + municipality = "" + juris_type = "county" + else: + county = "" + municipality = jurisdiction_data.get("name") + juris_type = "local" + + if 'Police' in agency_data.get("types"): + agency_type = 'law enforcement/police' + else: + agency_type = '' + + source_url = '' + absolute_url = item.get("absolute_url") + access_type = '' + for comm in item["communications"]: + if comm["files"]: + source_url = absolute_url + '#files' + access_type = 'Web page,Download,API' + break + + # Extract the relevant fields from the JSON object + csv_row = { + "name": item.get("title", ""), + "agency_described": agency_data.get("name", "") + ' - ' + state, + "record_type": "", + "description": "", + "source_url": source_url, + "readme_url": absolute_url, + "scraper_url": "", + "state": state, + "county": county, + "municipality": municipality, + "agency_type": agency_type, + "jurisdiction_type": juris_type, + "View Archive": "", + "agency_aggregation": "", + "agency_supplied": "no", + "supplying_entity": "MuckRock", + "agency_originated": "yes", + "originating_agency": agency_data.get("name", ""), + "coverage_start": "", + "source_last_updated": "", + "coverage_end": "", + "number_of_records_available": "", + "size": "", + "access_type": access_type, + "data_portal_type": "MuckRock", + "access_notes": "", + "record_format": "", + "update_frequency": "", + "update_method": "", + "retention_schedule": "", + "detail_level": "" + } + + # Write the extracted row to the CSV file + writer.writerow(csv_row) diff --git a/source_collectors/muckrock/get_all_record_types.py b/source_collectors/muckrock/get_all_record_types.py new file mode 100644 index 00000000..bcc8c0b7 --- /dev/null +++ 
b/source_collectors/muckrock/get_all_record_types.py @@ -0,0 +1,17 @@ +import subprocess + +record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings', + 'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports', + 'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports', + 'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports', + 'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports', + 'court cases', 'incarceration records'] + +for record_type in record_types: + command = ['python', 'search_foia_data_db.py', '--search_for', record_type] + + try: + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + print(f'An error occurred while executing the command for "{ + record_type}": {e}') diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py new file mode 100644 index 00000000..96cde838 --- /dev/null +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -0,0 +1,74 @@ +import requests +import json +import time + +# Function to fetch jurisdiction IDs based on town names from a text file +def fetch_jurisdiction_ids(town_file, base_url): + with open(town_file, "r") as file: + town_names = [line.strip() for line in file] + + jurisdiction_ids = {} + url = base_url + + while url: + response = requests.get(url) + if response.status_code == 200: + data = response.json() + for item in data.get('results', []): + if item['name'] in town_names: + jurisdiction_ids[item['name']] = item['id'] + + url = data.get("next") + print(f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far...") + time.sleep(1) # To respect the rate limit + + elif response.status_code == 503: + print("Error 503: Skipping page") + break + else: + print(f"Error fetching data: {response.status_code}") + break + + return jurisdiction_ids + +# Function to fetch FOIA data for each jurisdiction ID and save it to a JSON file +def fetch_foia_data(jurisdiction_ids): + all_data = [] + for name, id_ in jurisdiction_ids.items(): + url = f"https://www.muckrock.com/api_v1/foia/?status=done&jurisdiction={id_}" + while url: + response = requests.get(url) + if response.status_code == 200: + data = response.json() + all_data.extend(data.get("results", [])) + url = data.get("next") + print(f"Fetching records for {name}, {len(all_data)} total records so far...") + time.sleep(1) # To respect the rate limit + elif response.status_code == 503: + print(f"Error 503: Skipping page for {name}") + break + else: + print(f"Error fetching data: {response.status_code} for {name}") + break + + # Save the combined data to a JSON file + with open("foia_data_combined.json", "w") as json_file: + json.dump(all_data, json_file, indent=4) + + print(f"Saved {len(all_data)} records to foia_data_combined.json") + +# Main function to execute the script +def main(): + town_file = "allegheny-county-towns.txt" + jurisdiction_url = "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126" + + # Fetch jurisdiction IDs based on town names + jurisdiction_ids = fetch_jurisdiction_ids(town_file, jurisdiction_url) + print(f"Jurisdiction IDs 
fetched: {jurisdiction_ids}") + + # Fetch FOIA data for each jurisdiction ID + fetch_foia_data(jurisdiction_ids) + +# Run the main function +if __name__ == "__main__": + main() diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py new file mode 100644 index 00000000..ed1db454 --- /dev/null +++ b/source_collectors/muckrock/muck_get.py @@ -0,0 +1,50 @@ +import requests +import json + +# Define the base API endpoint +base_url = "https://www.muckrock.com/api_v1/foia/" + +# Define the search string +search_string = "use of force" +per_page = 100 +page = 1 +all_results = [] +max_count = 20 + +while True: + + # Make the GET request with the search string as a query parameter + response = requests.get(base_url, params={"page" : page, "page_size" : per_page, "format": "json"}) + + # Check if the request was successful + if response.status_code == 200: + # Parse the JSON response + data = response.json() + + if not data['results']: + break + + filtered_results = [item for item in data['results'] if search_string.lower() in item['title'].lower()] + + all_results.extend(filtered_results) + + if len(filtered_results) > 0: + num_results = len(filtered_results) + print(f"found {num_results} more matching result(s)...") + + if len(all_results) >= max_count: + print("max count reached... exiting") + break + + page += 1 + + else: + print(f"Error: {response.status_code}") + break + +# Dump list into a JSON file +json_out_file = search_string.replace(" ", "_") + ".json" +with open(json_out_file, 'w') as json_file: + json.dump(all_results, json_file) + +print(f"List dumped into {json_out_file}") diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py new file mode 100644 index 00000000..dafd6de2 --- /dev/null +++ b/source_collectors/muckrock/muckrock_ml_labeler.py @@ -0,0 +1,41 @@ +from transformers import AutoTokenizer, AutoModelForSequenceClassification +import torch +import pandas as pd +import argparse + +# Load the tokenizer and model +model_name = "PDAP/fine-url-classifier" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForSequenceClassification.from_pretrained(model_name) +model.eval() + +# Load the dataset from command line argument +parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") +parser.add_argument('--csv_file', type=str, required=True, help="Path to the CSV file") +args = parser.parse_args() +df = pd.read_csv(args.csv_file) + +# Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row +columns_to_combine = ['url_path', 'html_title', 'h1'] # Add other columns here as needed +df['combined_text'] = df[columns_to_combine].apply(lambda row: ' '.join(row.values.astype(str)), axis=1) + +# Convert the combined text into a list +texts = df['combined_text'].tolist() + +# Tokenize the inputs +inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") + +# Perform inference +with torch.no_grad(): + outputs = model(**inputs) + +# Get the predicted labels +predictions = torch.argmax(outputs.logits, dim=-1) + +# Map predictions to labels +labels = model.config.id2label +predicted_labels = [labels[int(pred)] for pred in predictions] + +# Add the predicted labels to the dataframe and save +df['predicted_label'] = predicted_labels +df.to_csv("labeled_muckrock_dataset.csv", index=False) diff --git a/source_collectors/muckrock/requirements.txt b/source_collectors/muckrock/requirements.txt new file 
mode 100644 index 00000000..babb4f3e --- /dev/null +++ b/source_collectors/muckrock/requirements.txt @@ -0,0 +1,30 @@ +certifi==2024.8.30 +charset-normalizer==3.4.0 +filelock==3.16.1 +fsspec==2024.10.0 +huggingface-hub==0.26.1 +idna==3.10 +Jinja2==3.1.4 +logging==0.4.9.6 +MarkupSafe==3.0.2 +mpmath==1.3.0 +networkx==3.4.2 +numpy==2.1.2 +packaging==24.1 +pandas==2.2.3 +python-dateutil==2.9.0.post0 +pytz==2024.2 +PyYAML==6.0.2 +regex==2024.9.11 +requests==2.32.3 +safetensors==0.4.5 +setuptools==75.2.0 +six==1.16.0 +sympy==1.13.1 +tokenizers==0.20.1 +torch==2.5.0 +tqdm==4.66.5 +transformers==4.46.0 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.2.3 diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py new file mode 100644 index 00000000..ff9aac68 --- /dev/null +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -0,0 +1,181 @@ +''' +search_foia_data_db.py + +This script provides search functionality for the `foia_data.db` SQLite database. The search looks in `title`s and +`tags` of FOIA requests that match an input string provided by the user. +Run this after companion script `create_foia_data_db.py`. + +A successful run will output a JSON file containing entries matching the search string. + +Functions: + - parser_init() + - search_foia_db() + - parse_communications_column() + - generate_json() + - main() + +Error Handling: +Errors encountered during database operations, JSON parsing, or file writing are printed to the console. +''' + + +import sqlite3 +import pandas as pd +import json +import argparse +import os +from typing import Union, List, Dict + +check_results_table_query = ''' + SELECT name FROM sqlite_master + WHERE (type = 'table') + AND (name = 'results') + ''' + +search_foia_query = ''' + SELECT * FROM results + WHERE (title LIKE ? OR tags LIKE ?) + AND (status = 'done') + ''' + + +def parser_init() -> argparse.ArgumentParser: + ''' + Initializes the argument parser for search_foia_data_db.py. + + Returns: + argparse.ArgumentParser: The configured argument parser. + ''' + + parser = argparse.ArgumentParser( + description='Search foia_data.db and generate a JSON file of resulting matches') + parser.add_argument('--search_for', type=str, required=True, metavar='', + help='Provide a string to search foia_data.db') + + return parser + + +def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]: + ''' + Searches the foia_data.db database for FOIA request entries matching the provided search string. + + Args: + search_string (str): The string to search for in the `title` and `tags` of the `results` table. + + Returns: + Union[pandas.DataFrame, None]: + - pandas.DataFrame: A DataFrame containing the matching entries from the database. + - None: If an error occurs during the database operation. + + Raises: + sqlite3.Error: If any database operation fails, prints error and returns None. + Exception: If any unexpected error occurs, prints error and returns None. 
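+
+    Illustrative usage (a sketch, assuming foia_data.db was built by create_foia_data_db.py):
+        df = search_foia_db('use of force')
+        if df is not None:
+            print(f'{df.shape[0]} matching completed requests')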
+ ''' + + print(f'Searching foia_data.db for "{search_string}"...') + + try: + with sqlite3.connect('foia_data.db') as conn: + + results_table = pd.read_sql_query(check_results_table_query, conn) + + if results_table.empty: + print('The `results` table does not exist in the database.') + return None + + params = [f'%{search_string}%', f'%{search_string}%'] + + df = pd.read_sql_query(search_foia_query, conn, params=params) + + except sqlite3.Error as e: + print(f'Sqlite error: {e}') + return None + except Exception as e: + print(f'An unexpected error occurred: {e}') + return None + + return df + + +def parse_communications_column(communications) -> List[Dict]: + ''' + Parses a communications column value, decoding it from JSON format. + + Args: + communications : The input value to be parsed, which can be a JSON string or NaN. + + Returns: + list (List[Dict]): A list containing the parsed JSON data. If the input is NaN (missing values) or + there is a JSON decoding error, an empty list is returned. + + Raises: + json.JSONDecodeError: If deserialization fails, prints error and returns empty list. + ''' + + if pd.isna(communications): + return [] + try: + return json.loads(communications) + except json.JSONDecodeError as e: + print(f'Error decoding JSON: {e}') + return [] + + +def generate_json(df: pd.DataFrame, search_string: str) -> None: + ''' + Generates a JSON file from a pandas DataFrame. + + Args: + df (pandas.DataFrame): The DataFrame containing the data to be written to the JSON file. + + search_string (str): The string used to name the output JSON file. Spaces in the string + are replaced with underscores. + + Returns: + None + + Raises: + Exception: If writing to JSON file operation fails, prints error and returns. + ''' + + output_json = f'{search_string.replace(' ', '_')}.json' + + try: + df.to_json(output_json, orient='records', indent=4) + print(f'Matching entries written to "{output_json}"') + except Exception as e: + print(f'An error occurred while writing JSON: {e}') + + +def main() -> None: + ''' + Function to search the foia_data.db database for entries matching a specified search string. + + Command Line Args: + --search_for (str): A string to search for in the `title` and `tags` fields of FOIA requests. 
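For context, each `communications` cell arrives as either a JSON string (written by `create_foia_data_db.py` with `json.dumps`) or NaN, which is what the decoding above has to handle; a small self-contained illustration with a made-up record:

```python
import json

import pandas as pd

# Illustrative only: what a stored `communications` cell looks like after json.dumps ...
stored = json.dumps(
    [{"from_user": "Records Division", "communication": "Documents attached."}]
)

# ... and how each cell decodes on the way back out, with NaN meaning "no communications".
for cell in (stored, float("nan")):
    decoded = [] if pd.isna(cell) else json.loads(cell)
    print(decoded)
```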
+ ''' + + parser = parser_init() + args = parser.parse_args() + search_string = args.search_for + + if not os.path.exists('foia_data.db'): + print('foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.') + return + + df = search_foia_db(search_string) + if df is None: + return + + if not df['communications'].empty: + df['communications'] = df['communications'].apply( + parse_communications_column) + + print(f'Found {df.shape[0]} matching entries containing "{ + search_string}" in the title or tags') + + generate_json(df, search_string) + + +if __name__ == '__main__': + main() diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py new file mode 100644 index 00000000..9e61d49c --- /dev/null +++ b/source_collectors/muckrock/search_local_foia_json.py @@ -0,0 +1,38 @@ +import json + +# Specify the JSON file path +json_file = 'foia_data.json' +search_string = 'use of force' + +# Load the JSON data +with open(json_file, 'r', encoding='utf-8') as file: + data = json.load(file) + +# List to store matching entries +matching_entries = [] + +# Function to search within an entry +def search_entry(entry): + # Check if 'status' is 'done' + if entry.get('status') != 'done': + return False + + # Check if 'title' or 'tags' field contains the search string + title_match = 'title' in entry and search_string.lower() in entry['title'].lower() + tags_match = 'tags' in entry and any(search_string.lower() in tag.lower() for tag in entry['tags']) + + return title_match or tags_match + +# Iterate through the data and collect matching entries +for entry in data: + if search_entry(entry): + matching_entries.append(entry) + +# Output the results +print(f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags.") + +# Optionally, write matching entries to a new JSON file +with open('matching_entries.json', 'w', encoding='utf-8') as file: + json.dump(matching_entries, file, indent=4) + +print(f"Matching entries written to 'matching_entries.json'") diff --git a/source_collectors/muckrock/utils.py b/source_collectors/muckrock/utils.py new file mode 100644 index 00000000..ca66dc8c --- /dev/null +++ b/source_collectors/muckrock/utils.py @@ -0,0 +1,27 @@ +''' +utils.py + +Provides useful functions for muckrock_tools. + +Functions: + - format_filename_json_to_csv() +''' + + +import re + + +def format_filename_json_to_csv(json_filename: str) -> str: + ''' + Converts JSON filename format to CSV filename format. + + Args: + json_file (str): A JSON filename string. + + Returns: + csv_filename (str): A CSV filename string. 
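As a quick illustration of the conversion this helper performs, using the filename `muck_get.py` would produce for its default search string:

```python
import re

# Strip ".json", replace the remaining underscores with hyphens, append ".csv".
json_filename = "use_of_force.json"
csv_filename = re.sub(r"_(?=[^.]*$)", "-", json_filename[:-5]) + ".csv"
print(csv_filename)  # use-of-force.csv
```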
+ + ''' + csv_filename = re.sub(r'_(?=[^.]*$)', '-', json_filename[:-5]) + '.csv' + + return csv_filename From cdbae207cad7b263aac6664e75476a9ae7beba57 Mon Sep 17 00:00:00 2001 From: eddie-m-m Date: Fri, 15 Nov 2024 16:19:44 -0800 Subject: [PATCH 02/11] --amend --- source_collectors/muckrock/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/source_collectors/muckrock/.gitignore b/source_collectors/muckrock/.gitignore index 9ed70e69..3ad8c498 100644 --- a/source_collectors/muckrock/.gitignore +++ b/source_collectors/muckrock/.gitignore @@ -225,4 +225,5 @@ flycheck_*.el *.db *.json *.csv +/csv last_page_fetched.txt From 9d9618aa0d2ac7fb58f7307ac55e936fd661522c Mon Sep 17 00:00:00 2001 From: eddie-m-m Date: Fri, 15 Nov 2024 17:06:58 -0800 Subject: [PATCH 03/11] Lint added files --- common_crawler/argparser.py | 43 +++-- common_crawler/cache.py | 16 +- common_crawler/crawler.py | 41 +++-- common_crawler/csv_manager.py | 23 +-- common_crawler/main.py | 108 +++++++----- common_crawler/utils.py | 2 +- .../ckan/ckan_scraper_toolkit.py | 14 +- .../ckan/scrape_ckan_data_portals.py | 1 + source_collectors/ckan/search_terms.py | 2 +- source_collectors/common_crawler/argparser.py | 43 +++-- source_collectors/common_crawler/cache.py | 16 +- source_collectors/common_crawler/crawler.py | 41 +++-- .../common_crawler/csv_manager.py | 23 +-- source_collectors/common_crawler/main.py | 108 +++++++----- source_collectors/common_crawler/utils.py | 2 +- .../convert_all_record_types_to_csv.py | 47 ++++- .../muckrock/create_foia_data_db.py | 165 +++++++++--------- .../muckrock/download_muckrock_foia.py | 12 +- .../generate_detailed_muckrock_csv.py | 68 +++++--- .../muckrock/get_all_record_types.py | 51 +++++- .../muckrock/get_allegheny_foias.py | 22 ++- source_collectors/muckrock/muck_get.py | 14 +- .../muckrock/muckrock_ml_labeler.py | 16 +- .../muckrock/search_foia_data_db.py | 82 +++++---- .../muckrock/search_local_foia_json.py | 26 +-- source_collectors/muckrock/utils.py | 11 +- 26 files changed, 616 insertions(+), 381 deletions(-) diff --git a/common_crawler/argparser.py b/common_crawler/argparser.py index 8cdf5b78..67f4a290 100644 --- a/common_crawler/argparser.py +++ b/common_crawler/argparser.py @@ -7,6 +7,7 @@ for the Common Crawler script. """ + def valid_common_crawl_id(common_crawl_id: str) -> bool: """ Validate the Common Crawl ID format. @@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool: Returns: True if the Common Crawl ID is valid, False otherwise """ - return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None + def parse_args() -> argparse.Namespace: """ @@ -33,22 +35,41 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser( - description='Query the Common Crawl dataset and optionally save the results to a file.') + description="Query the Common Crawl dataset and optionally save the results to a file." 
+ ) # Add the required arguments - parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') - parser.add_argument('url', type=str, help='The URL to query') - parser.add_argument('keyword', type=str, help='The keyword to search in the url') + parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") + parser.add_argument("url", type=str, help="The URL to query") + parser.add_argument("keyword", type=str, help="The keyword to search in the url") # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') - parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') - parser.add_argument('--reset-cache', action='store_true', default=False, - help='Reset the cache before starting the crawl') + parser.add_argument( + "-c", + "--config", + type=str, + default="config.ini", + help="The configuration file to use", + ) + parser.add_argument( + "-p", + "--pages", + type=int, + default=1, + help="The number of pages to search (default: 1)", + ) + parser.add_argument( + "--reset-cache", + action="store_true", + default=False, + help="Reset the cache before starting the crawl", + ) args = parser.parse_args() # Validate the Common Crawl ID format if not valid_common_crawl_id(args.common_crawl_id): - parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.") + parser.error( + "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." + ) # Read the configuration file config = configparser.ConfigParser() @@ -56,7 +77,7 @@ def parse_args() -> argparse.Namespace: # Combine parsed arguments with configuration file defaults app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config['DEFAULT']) + app_parser.set_defaults(**config["DEFAULT"]) app_args = app_parser.parse_args() diff --git a/common_crawler/cache.py b/common_crawler/cache.py index 2a48c0b7..23d58819 100644 --- a/common_crawler/cache.py +++ b/common_crawler/cache.py @@ -8,11 +8,13 @@ - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results """ + class CommonCrawlerCacheManager: """ A class for managing the cache of Common Crawl search results. This class is responsible for adding, retrieving, and saving cache data. """ + def __init__(self, file_name: str = "cache", directory=None): """ Initializes the CacheStorage object with a file name and directory. @@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: self.cache[index][url] = {} self.cache[index][url][keyword] = last_page - def get(self, index, url, keyword) -> int: """ Retrieves a page number from the cache. @@ -53,12 +54,15 @@ def get(self, index, url, keyword) -> int: Returns: int - the last page crawled """ - if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + if ( + index in self.cache + and url in self.cache[index] + and keyword in self.cache[index][url] + ): return self.cache[index][url][keyword] # The cache object does not exist. Return 0 as the default value. return 0 - def load_or_create_cache(self) -> dict: """ Loads the cache from the configured file path. 
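To make the nesting in `upsert`/`get` above concrete, this is the shape the cache ends up with; the crawl id, URL pattern, keyword, and page number are illustrative values only:

```python
import json

# crawl index -> url search term -> keyword -> last page crawled
cache = {
    "CC-MAIN-2023-50": {
        "*.gov": {
            "police": 3,
        },
    },
}

# get() walks the same three levels and falls back to 0 when any level is missing.
last_page = cache.get("CC-MAIN-2023-50", {}).get("*.gov", {}).get("police", 0)
print(last_page)                    # 3
print(json.dumps(cache, indent=4))  # what save_cache() would write to disk
```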
@@ -66,12 +70,11 @@ def load_or_create_cache(self) -> dict: Returns: dict - the cache data """ try: - with open(self.file_path, 'r') as file: + with open(self.file_path, "r") as file: return json.load(file) except FileNotFoundError: return {} - def save_cache(self) -> None: """ Converts the cache object into a JSON-serializable format and saves it to the configured file path. @@ -79,10 +82,9 @@ def save_cache(self) -> None: persistence of crawl data across sessions. """ # Reformat cache data for JSON serialization - with open(self.file_path, 'w') as file: + with open(self.file_path, "w") as file: json.dump(self.cache, file, indent=4) - def reset_cache(self) -> None: """ Resets the cache to an empty state. diff --git a/common_crawler/crawler.py b/common_crawler/crawler.py index 9afba7d8..0982ca53 100644 --- a/common_crawler/crawler.py +++ b/common_crawler/crawler.py @@ -16,7 +16,6 @@ # TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - @dataclass class CommonCrawlResult: last_page_search: int @@ -31,16 +30,17 @@ class CommonCrawlerManager: It validates crawl ids, manages pagination, and aggregates results. """ - def __init__(self, crawl_id='CC-MAIN-2023-50'): + def __init__(self, crawl_id="CC-MAIN-2023-50"): self.crawl_id = crawl_id - CC_INDEX_SERVER = 'http://index.commoncrawl.org/' - INDEX_NAME = f'{self.crawl_id}-index' - self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + CC_INDEX_SERVER = "http://index.commoncrawl.org/" + INDEX_NAME = f"{self.crawl_id}-index" + self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}") + f" starting at page {start_page}" + ) url_results = [] @@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResul return CommonCrawlResult(last_page, url_results) - def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + def search_common_crawl_index( + self, url: str, page: int = 0, max_retries: int = 20 + ) -> list[dict]: """ This method is used to search the Common Crawl index for a given URL and page number Args: @@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = """ encoded_url = quote_plus(url) search_url = URLWithParameters(self.root_url) - search_url.add_parameter('url', encoded_url) - search_url.add_parameter('output', 'json') - search_url.add_parameter('page', page) + search_url.add_parameter("url", encoded_url) + search_url.add_parameter("output", "json") + search_url.add_parameter("page", page) retries = 0 delay = 1 @@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = return self.process_response(response, url, page) retries += 1 - print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + print( + f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" + ) time.sleep(delay) print(f"Max retries exceeded. 
Failed to get records for {url} on page {page}.") @@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response: response.raise_for_status() return response except requests.exceptions.RequestException as e: - if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + if ( + response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in response.text + ): return None else: print(f"Failed to get records: {e}") return None - def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + def process_response( + self, response: requests.Response, url: str, page: int + ) -> list[dict]: """Processes the HTTP response and returns the parsed records if successful.""" if response.status_code == HTTPStatus.OK: - records = response.text.strip().split('\n') + records = response.text.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") return [json.loads(record) for record in records] - elif 'First Page is 0, Last Page is 0' in response.text: + elif "First Page is 0, Last Page is 0" in response.text: print("No records exist in index matching the url search term") return None else: @@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record['url'] for record in records if keyword in record['url']] + return [record["url"] for record in records if keyword in record["url"]] diff --git a/common_crawler/csv_manager.py b/common_crawler/csv_manager.py index 69868629..2b823b42 100644 --- a/common_crawler/csv_manager.py +++ b/common_crawler/csv_manager.py @@ -10,12 +10,7 @@ class CSVManager: Creates the file if it doesn't exist, and provides a method for adding new rows. 
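For reference, the index server answers with one JSON object per line, which is why `process_response` splits on newlines before calling `json.loads`; a minimal sketch with fabricated records:

```python
import json

# Fabricated two-record response body in the newline-delimited JSON format
# returned by the Common Crawl index server.
response_text = (
    '{"url": "https://www.example-county.gov/police/records", "status": "200"}\n'
    '{"url": "https://www.example-county.gov/parks", "status": "200"}'
)

records = [json.loads(record) for record in response_text.strip().split("\n")]

# Same keyword filter as get_urls_with_keyword().
keyword = "police"
matches = [record["url"] for record in records if keyword in record["url"]]
print(matches)  # ['https://www.example-county.gov/police/records']
```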
""" - def __init__( - self, - file_name: str, - headers: list[str], - directory=None - ): + def __init__(self, file_name: str, headers: list[str], directory=None): self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): @@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]): """ if isinstance(row_values, str): # Single values must be converted to a list format - row_values = [row_values] + row_values = [row_values] try: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(row_values) except Exception as e: @@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None: Returns: None """ for result in results: - self.add_row( - result - ) + self.add_row(result) print(f"{len(results)} URLs written to {self.file_path}") def initialize_file(self): @@ -59,15 +52,17 @@ def initialize_file(self): file_exists = os.path.isfile(self.file_path) if not file_exists: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(self.headers) else: # Open and check that headers match - with open(self.file_path, mode='r', encoding='utf-8') as file: + with open(self.file_path, mode="r", encoding="utf-8") as file: header_row = next(csv.reader(file)) if header_row != self.headers: - raise ValueError(f"Header row in {self.file_path} does not match expected headers") + raise ValueError( + f"Header row in {self.file_path} does not match expected headers" + ) print(f"CSV file initialized at {self.file_path}") def delete_file(self): diff --git a/common_crawler/main.py b/common_crawler/main.py index ae27f556..b9dd012f 100644 --- a/common_crawler/main.py +++ b/common_crawler/main.py @@ -10,7 +10,7 @@ # The below code sets the working directory to be the root of the entire repository # This is done to solve otherwise quite annoying import issues. 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.huggingface_api_manager import HuggingFaceAPIManager from util.miscellaneous_functions import get_filename_friendly_timestamp @@ -35,30 +35,34 @@ class BatchInfo: notes: str filename: str + class LabelStudioError(Exception): """Custom exception for Label Studio Errors""" + pass -BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] + def get_current_time(): return str(datetime.now()) -def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: +def add_batch_info_to_csv( + common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int +) -> BatchInfo: batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", count=str(len(common_crawl_result.url_results)), keywords=f"{args.url} - {args.keyword}", notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", ) batch_info_csv_manager = CSVManager( - file_name='batch_info', - directory=args.data_dir, - headers=BATCH_HEADERS + file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS ) batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) @@ -71,12 +75,11 @@ def main(): # Initialize the Cache cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, - directory=args.data_dir + file_name=args.cache_filename, directory=args.data_dir ) load_dotenv() - + # Initialize the HuggingFace API Manager hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") if not hf_access_token: @@ -84,10 +87,10 @@ def main(): "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory.") + "https://huggingface.co/PDAP. Then include in .env file in root directory." + ) huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, - repo_id=args.huggingface_repo_id + access_token=hf_access_token, repo_id=args.huggingface_repo_id ) ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") if not ls_access_token: @@ -95,13 +98,15 @@ def main(): "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + "https://app.heartex.com/projects/61550. Then include in .env file in root directory." + ) ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") if not ls_project_id: raise ValueError( "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory.") + "where it will be visibile in the url. Then include in .env file in root directory." 
+ ) try: print("Retrieving Label Studio data for deduplication") @@ -119,7 +124,9 @@ def main(): try: # Retrieve the last page from the cache, or 0 if it does not exist last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + common_crawl_result = process_crawl_and_upload( + args, last_page, huggingface_api_manager, label_studio_results + ) except ValueError as e: print(f"Error during crawling: {e}") return @@ -129,12 +136,14 @@ def main(): index=args.common_crawl_id, url=args.url, keyword=args.keyword, - last_page=common_crawl_result.last_page_search) + last_page=common_crawl_result.last_page_search, + ) cache_manager.save_cache() except ValueError as e: print(f"Error while saving cache manager: {e}") + def handle_remote_results_error(remote_results): """ Handles errors in the remote results @@ -151,6 +160,7 @@ def handle_remote_results_error(remote_results): else: raise LabelStudioError(f"Unexpected error: {remote_results}") + def validate_remote_results(remote_results): """ Validates the remote results retrieved from the Label Studio project @@ -166,7 +176,9 @@ def validate_remote_results(remote_results): print("No data in Label Studio project.") return [] elif "url" not in remote_results[0]["data"]: - raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + raise LabelStudioError( + "Column 'url' not present in Label Studio project. Exiting..." + ) else: return remote_results elif isinstance(remote_results, dict): @@ -174,6 +186,7 @@ def validate_remote_results(remote_results): else: raise LabelStudioError("Unexpected response type.") + def get_ls_data() -> list[dict] | None: """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. @@ -190,14 +203,14 @@ def get_ls_data() -> list[dict] | None: def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. + """Strips http(s)://www. from the beginning of a url if applicable. Args: url (str): The URL to strip. Returns: str: The stripped URL. - """ + """ result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) return result @@ -210,7 +223,7 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: Returns: list[str]: List of unique URLs. - """ + """ stripped_url_results = [strip_url(url) for url in url_results] unique_urls = collections.deque() adjust = 0 @@ -225,7 +238,9 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: return url_results -def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: +def remove_remote_duplicates( + url_results: list[str], label_studio_data: list[dict] +) -> list[str]: """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. Args: @@ -238,7 +253,9 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic try: remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] except TypeError: - print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + print( + "Invalid Label Studio credentials. Database could not be checked for duplicates." 
+ ) return url_results remote_urls = set(remote_urls) @@ -254,10 +271,11 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int): + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int, +): """ Handles the CSV file and uploads it to Hugging Face repository. Args: @@ -270,29 +288,27 @@ def handle_csv_and_upload( batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) csv_manager = CSVManager( - file_name=batch_info.filename, - headers=['url'], - directory=args.data_dir + file_name=batch_info.filename, headers=["url"], directory=args.data_dir ) csv_manager.add_rows(common_crawl_result.url_results) huggingface_api_manager.upload_file( local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", ) print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" + ) csv_manager.delete_file() def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict]) -> CommonCrawlResult: + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict], +) -> CommonCrawlResult: # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager( - args.common_crawl_id - ) + crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched start_page = last_page + 1 # Use the parsed arguments @@ -300,7 +316,7 @@ def process_crawl_and_upload( search_term=args.url, keyword=args.keyword, num_pages=args.pages, - start_page=start_page + start_page=start_page, ) # Logic should conclude here if no results are found if not common_crawl_result.url_results: @@ -309,10 +325,16 @@ def process_crawl_and_upload( return common_crawl_result print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) - common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + common_crawl_result.url_results = remove_local_duplicates( + common_crawl_result.url_results + ) + common_crawl_result.url_results = remove_remote_duplicates( + common_crawl_result.url_results, label_studio_data + ) if not common_crawl_result.url_results: - print("No urls not already present in the database found. Ceasing main execution.") + print( + "No urls not already present in the database found. Ceasing main execution." + ) add_batch_info_to_csv(common_crawl_result, args, last_page) return common_crawl_result diff --git a/common_crawler/utils.py b/common_crawler/utils.py index 0848b023..3cea7af2 100644 --- a/common_crawler/utils.py +++ b/common_crawler/utils.py @@ -12,7 +12,7 @@ def __init__(self, url): self.url = url def add_parameter(self, parameter, value): - if '?' in self.url: + if "?" 
in self.url: self.url += f"&{parameter}={value}" else: self.url += f"?{parameter}={value}" diff --git a/source_collectors/ckan/ckan_scraper_toolkit.py b/source_collectors/ckan/ckan_scraper_toolkit.py index 0d9dc449..5898c9f0 100644 --- a/source_collectors/ckan/ckan_scraper_toolkit.py +++ b/source_collectors/ckan/ckan_scraper_toolkit.py @@ -1,4 +1,5 @@ """Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals""" + from concurrent.futures import as_completed, ThreadPoolExecutor from dataclasses import dataclass, field from datetime import datetime @@ -150,10 +151,7 @@ def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: for dataset_content in soup.find_all(class_="dataset-content") ] - [ - packages.append(package.result()) - for package in as_completed(futures) - ] + [packages.append(package.result()) for package in as_completed(futures)] # Take a break to avoid being timed out if len(futures) >= 15: @@ -186,10 +184,12 @@ def _collection_search_get_package_data(dataset_content, base_url: str): record_format.text.strip() for record_format in dataset_content.find_all("li") ] package.record_format = list(set(package.record_format)) - + date = dataset_soup.find(property="dct:modified").text.strip() - package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%d-%m") - + package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime( + "%Y-%d-%m" + ) + return package diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index ef83b4dc..57bd9927 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -1,4 +1,5 @@ """Retrieves packages from CKAN data portals and parses relevant information then outputs to a CSV file""" + from itertools import chain import json import sys diff --git a/source_collectors/ckan/search_terms.py b/source_collectors/ckan/search_terms.py index 7fdbc34e..179e58d8 100644 --- a/source_collectors/ckan/search_terms.py +++ b/source_collectors/ckan/search_terms.py @@ -11,7 +11,7 @@ {"url": "https://open.jacksonms.gov/", "terms": ["tags:police"]}, {"url": "https://data.milwaukee.gov/", "terms": ["mpd", "wibr"]}, {"url": "https://data.sanantonio.gov/", "terms": ["sapd"]}, - {"url": "https://data.sanjoseca.gov/", "terms": ["police"]} + {"url": "https://data.sanjoseca.gov/", "terms": ["police"]}, ] group_search = [ diff --git a/source_collectors/common_crawler/argparser.py b/source_collectors/common_crawler/argparser.py index 8cdf5b78..67f4a290 100644 --- a/source_collectors/common_crawler/argparser.py +++ b/source_collectors/common_crawler/argparser.py @@ -7,6 +7,7 @@ for the Common Crawler script. """ + def valid_common_crawl_id(common_crawl_id: str) -> bool: """ Validate the Common Crawl ID format. @@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool: Returns: True if the Common Crawl ID is valid, False otherwise """ - return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None + def parse_args() -> argparse.Namespace: """ @@ -33,22 +35,41 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser( - description='Query the Common Crawl dataset and optionally save the results to a file.') + description="Query the Common Crawl dataset and optionally save the results to a file." 
+ ) # Add the required arguments - parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') - parser.add_argument('url', type=str, help='The URL to query') - parser.add_argument('keyword', type=str, help='The keyword to search in the url') + parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") + parser.add_argument("url", type=str, help="The URL to query") + parser.add_argument("keyword", type=str, help="The keyword to search in the url") # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') - parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') - parser.add_argument('--reset-cache', action='store_true', default=False, - help='Reset the cache before starting the crawl') + parser.add_argument( + "-c", + "--config", + type=str, + default="config.ini", + help="The configuration file to use", + ) + parser.add_argument( + "-p", + "--pages", + type=int, + default=1, + help="The number of pages to search (default: 1)", + ) + parser.add_argument( + "--reset-cache", + action="store_true", + default=False, + help="Reset the cache before starting the crawl", + ) args = parser.parse_args() # Validate the Common Crawl ID format if not valid_common_crawl_id(args.common_crawl_id): - parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.") + parser.error( + "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." + ) # Read the configuration file config = configparser.ConfigParser() @@ -56,7 +77,7 @@ def parse_args() -> argparse.Namespace: # Combine parsed arguments with configuration file defaults app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config['DEFAULT']) + app_parser.set_defaults(**config["DEFAULT"]) app_args = app_parser.parse_args() diff --git a/source_collectors/common_crawler/cache.py b/source_collectors/common_crawler/cache.py index 2a48c0b7..23d58819 100644 --- a/source_collectors/common_crawler/cache.py +++ b/source_collectors/common_crawler/cache.py @@ -8,11 +8,13 @@ - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results """ + class CommonCrawlerCacheManager: """ A class for managing the cache of Common Crawl search results. This class is responsible for adding, retrieving, and saving cache data. """ + def __init__(self, file_name: str = "cache", directory=None): """ Initializes the CacheStorage object with a file name and directory. @@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: self.cache[index][url] = {} self.cache[index][url][keyword] = last_page - def get(self, index, url, keyword) -> int: """ Retrieves a page number from the cache. @@ -53,12 +54,15 @@ def get(self, index, url, keyword) -> int: Returns: int - the last page crawled """ - if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + if ( + index in self.cache + and url in self.cache[index] + and keyword in self.cache[index][url] + ): return self.cache[index][url][keyword] # The cache object does not exist. Return 0 as the default value. return 0 - def load_or_create_cache(self) -> dict: """ Loads the cache from the configured file path. 
@@ -66,12 +70,11 @@ def load_or_create_cache(self) -> dict: Returns: dict - the cache data """ try: - with open(self.file_path, 'r') as file: + with open(self.file_path, "r") as file: return json.load(file) except FileNotFoundError: return {} - def save_cache(self) -> None: """ Converts the cache object into a JSON-serializable format and saves it to the configured file path. @@ -79,10 +82,9 @@ def save_cache(self) -> None: persistence of crawl data across sessions. """ # Reformat cache data for JSON serialization - with open(self.file_path, 'w') as file: + with open(self.file_path, "w") as file: json.dump(self.cache, file, indent=4) - def reset_cache(self) -> None: """ Resets the cache to an empty state. diff --git a/source_collectors/common_crawler/crawler.py b/source_collectors/common_crawler/crawler.py index 9afba7d8..0982ca53 100644 --- a/source_collectors/common_crawler/crawler.py +++ b/source_collectors/common_crawler/crawler.py @@ -16,7 +16,6 @@ # TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - @dataclass class CommonCrawlResult: last_page_search: int @@ -31,16 +30,17 @@ class CommonCrawlerManager: It validates crawl ids, manages pagination, and aggregates results. """ - def __init__(self, crawl_id='CC-MAIN-2023-50'): + def __init__(self, crawl_id="CC-MAIN-2023-50"): self.crawl_id = crawl_id - CC_INDEX_SERVER = 'http://index.commoncrawl.org/' - INDEX_NAME = f'{self.crawl_id}-index' - self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + CC_INDEX_SERVER = "http://index.commoncrawl.org/" + INDEX_NAME = f"{self.crawl_id}-index" + self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}") + f" starting at page {start_page}" + ) url_results = [] @@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResul return CommonCrawlResult(last_page, url_results) - def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + def search_common_crawl_index( + self, url: str, page: int = 0, max_retries: int = 20 + ) -> list[dict]: """ This method is used to search the Common Crawl index for a given URL and page number Args: @@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = """ encoded_url = quote_plus(url) search_url = URLWithParameters(self.root_url) - search_url.add_parameter('url', encoded_url) - search_url.add_parameter('output', 'json') - search_url.add_parameter('page', page) + search_url.add_parameter("url", encoded_url) + search_url.add_parameter("output", "json") + search_url.add_parameter("page", page) retries = 0 delay = 1 @@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = return self.process_response(response, url, page) retries += 1 - print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + print( + f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" + ) time.sleep(delay) print(f"Max retries exceeded. 
Failed to get records for {url} on page {page}.") @@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response: response.raise_for_status() return response except requests.exceptions.RequestException as e: - if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + if ( + response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in response.text + ): return None else: print(f"Failed to get records: {e}") return None - def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + def process_response( + self, response: requests.Response, url: str, page: int + ) -> list[dict]: """Processes the HTTP response and returns the parsed records if successful.""" if response.status_code == HTTPStatus.OK: - records = response.text.strip().split('\n') + records = response.text.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") return [json.loads(record) for record in records] - elif 'First Page is 0, Last Page is 0' in response.text: + elif "First Page is 0, Last Page is 0" in response.text: print("No records exist in index matching the url search term") return None else: @@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record['url'] for record in records if keyword in record['url']] + return [record["url"] for record in records if keyword in record["url"]] diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py index 69868629..2b823b42 100644 --- a/source_collectors/common_crawler/csv_manager.py +++ b/source_collectors/common_crawler/csv_manager.py @@ -10,12 +10,7 @@ class CSVManager: Creates the file if it doesn't exist, and provides a method for adding new rows. 
""" - def __init__( - self, - file_name: str, - headers: list[str], - directory=None - ): + def __init__(self, file_name: str, headers: list[str], directory=None): self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): @@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]): """ if isinstance(row_values, str): # Single values must be converted to a list format - row_values = [row_values] + row_values = [row_values] try: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(row_values) except Exception as e: @@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None: Returns: None """ for result in results: - self.add_row( - result - ) + self.add_row(result) print(f"{len(results)} URLs written to {self.file_path}") def initialize_file(self): @@ -59,15 +52,17 @@ def initialize_file(self): file_exists = os.path.isfile(self.file_path) if not file_exists: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(self.headers) else: # Open and check that headers match - with open(self.file_path, mode='r', encoding='utf-8') as file: + with open(self.file_path, mode="r", encoding="utf-8") as file: header_row = next(csv.reader(file)) if header_row != self.headers: - raise ValueError(f"Header row in {self.file_path} does not match expected headers") + raise ValueError( + f"Header row in {self.file_path} does not match expected headers" + ) print(f"CSV file initialized at {self.file_path}") def delete_file(self): diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py index ae27f556..b9dd012f 100644 --- a/source_collectors/common_crawler/main.py +++ b/source_collectors/common_crawler/main.py @@ -10,7 +10,7 @@ # The below code sets the working directory to be the root of the entire repository # This is done to solve otherwise quite annoying import issues. 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.huggingface_api_manager import HuggingFaceAPIManager from util.miscellaneous_functions import get_filename_friendly_timestamp @@ -35,30 +35,34 @@ class BatchInfo: notes: str filename: str + class LabelStudioError(Exception): """Custom exception for Label Studio Errors""" + pass -BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] + def get_current_time(): return str(datetime.now()) -def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: +def add_batch_info_to_csv( + common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int +) -> BatchInfo: batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", count=str(len(common_crawl_result.url_results)), keywords=f"{args.url} - {args.keyword}", notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", ) batch_info_csv_manager = CSVManager( - file_name='batch_info', - directory=args.data_dir, - headers=BATCH_HEADERS + file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS ) batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) @@ -71,12 +75,11 @@ def main(): # Initialize the Cache cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, - directory=args.data_dir + file_name=args.cache_filename, directory=args.data_dir ) load_dotenv() - + # Initialize the HuggingFace API Manager hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") if not hf_access_token: @@ -84,10 +87,10 @@ def main(): "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory.") + "https://huggingface.co/PDAP. Then include in .env file in root directory." + ) huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, - repo_id=args.huggingface_repo_id + access_token=hf_access_token, repo_id=args.huggingface_repo_id ) ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") if not ls_access_token: @@ -95,13 +98,15 @@ def main(): "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + "https://app.heartex.com/projects/61550. Then include in .env file in root directory." + ) ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") if not ls_project_id: raise ValueError( "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory.") + "where it will be visibile in the url. Then include in .env file in root directory." 
+ ) try: print("Retrieving Label Studio data for deduplication") @@ -119,7 +124,9 @@ def main(): try: # Retrieve the last page from the cache, or 0 if it does not exist last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + common_crawl_result = process_crawl_and_upload( + args, last_page, huggingface_api_manager, label_studio_results + ) except ValueError as e: print(f"Error during crawling: {e}") return @@ -129,12 +136,14 @@ def main(): index=args.common_crawl_id, url=args.url, keyword=args.keyword, - last_page=common_crawl_result.last_page_search) + last_page=common_crawl_result.last_page_search, + ) cache_manager.save_cache() except ValueError as e: print(f"Error while saving cache manager: {e}") + def handle_remote_results_error(remote_results): """ Handles errors in the remote results @@ -151,6 +160,7 @@ def handle_remote_results_error(remote_results): else: raise LabelStudioError(f"Unexpected error: {remote_results}") + def validate_remote_results(remote_results): """ Validates the remote results retrieved from the Label Studio project @@ -166,7 +176,9 @@ def validate_remote_results(remote_results): print("No data in Label Studio project.") return [] elif "url" not in remote_results[0]["data"]: - raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + raise LabelStudioError( + "Column 'url' not present in Label Studio project. Exiting..." + ) else: return remote_results elif isinstance(remote_results, dict): @@ -174,6 +186,7 @@ def validate_remote_results(remote_results): else: raise LabelStudioError("Unexpected response type.") + def get_ls_data() -> list[dict] | None: """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. @@ -190,14 +203,14 @@ def get_ls_data() -> list[dict] | None: def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. + """Strips http(s)://www. from the beginning of a url if applicable. Args: url (str): The URL to strip. Returns: str: The stripped URL. - """ + """ result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) return result @@ -210,7 +223,7 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: Returns: list[str]: List of unique URLs. - """ + """ stripped_url_results = [strip_url(url) for url in url_results] unique_urls = collections.deque() adjust = 0 @@ -225,7 +238,9 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: return url_results -def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: +def remove_remote_duplicates( + url_results: list[str], label_studio_data: list[dict] +) -> list[str]: """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. Args: @@ -238,7 +253,9 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic try: remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] except TypeError: - print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + print( + "Invalid Label Studio credentials. Database could not be checked for duplicates." 
+ ) return url_results remote_urls = set(remote_urls) @@ -254,10 +271,11 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int): + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int, +): """ Handles the CSV file and uploads it to Hugging Face repository. Args: @@ -270,29 +288,27 @@ def handle_csv_and_upload( batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) csv_manager = CSVManager( - file_name=batch_info.filename, - headers=['url'], - directory=args.data_dir + file_name=batch_info.filename, headers=["url"], directory=args.data_dir ) csv_manager.add_rows(common_crawl_result.url_results) huggingface_api_manager.upload_file( local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", ) print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" + ) csv_manager.delete_file() def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict]) -> CommonCrawlResult: + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict], +) -> CommonCrawlResult: # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager( - args.common_crawl_id - ) + crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched start_page = last_page + 1 # Use the parsed arguments @@ -300,7 +316,7 @@ def process_crawl_and_upload( search_term=args.url, keyword=args.keyword, num_pages=args.pages, - start_page=start_page + start_page=start_page, ) # Logic should conclude here if no results are found if not common_crawl_result.url_results: @@ -309,10 +325,16 @@ def process_crawl_and_upload( return common_crawl_result print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) - common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + common_crawl_result.url_results = remove_local_duplicates( + common_crawl_result.url_results + ) + common_crawl_result.url_results = remove_remote_duplicates( + common_crawl_result.url_results, label_studio_data + ) if not common_crawl_result.url_results: - print("No urls not already present in the database found. Ceasing main execution.") + print( + "No urls not already present in the database found. Ceasing main execution." + ) add_batch_info_to_csv(common_crawl_result, args, last_page) return common_crawl_result diff --git a/source_collectors/common_crawler/utils.py b/source_collectors/common_crawler/utils.py index 0848b023..3cea7af2 100644 --- a/source_collectors/common_crawler/utils.py +++ b/source_collectors/common_crawler/utils.py @@ -12,7 +12,7 @@ def __init__(self, url): self.url = url def add_parameter(self, parameter, value): - if '?' in self.url: + if "?" 
in self.url: self.url += f"&{parameter}={value}" else: self.url += f"?{parameter}={value}" diff --git a/source_collectors/muckrock/convert_all_record_types_to_csv.py b/source_collectors/muckrock/convert_all_record_types_to_csv.py index be6d5364..30acdbbe 100644 --- a/source_collectors/muckrock/convert_all_record_types_to_csv.py +++ b/source_collectors/muckrock/convert_all_record_types_to_csv.py @@ -1,12 +1,43 @@ -import subprocess -import os +# import subprocess +# import os -record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings', - 'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports', - 'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports', - 'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports', - 'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports', - 'court cases', 'incarceration records'] +record_types = [ + "accident reports", + "arrest records", + "calls for service", + "car gps", + "citations", + "dispatch logs", + "dispatch recordings", + "field contacts", + "incident reports", + "misc police activity", + "officer involved shootings", + "stops", + "surveys", + "use of force reports", + "vehicle pursuits", + "complaints and misconduct", + "daily activity logs", + "training and hiring info", + "personnel records", + "annual and monthly reports", + "budgets and finances", + "contact info and agency meta", + "geographic", + "list of data sources", + "policies and contracts", + "crime maps and reports", + "crime statistics", + "media bulletins", + "records request info", + "resources", + "sex offender registry", + "wanted persons", + "booking reports", + "court cases", + "incarceration records", +] print(len(record_types)) # json_files = [] diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 44801055..4adc5556 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -1,4 +1,4 @@ -''' +""" create_foia_data_db.py This script fetches data from the MuckRock FOIA API and stores it in a SQLite database. @@ -17,8 +17,7 @@ Error Handling: Errors encountered during API requests or database operations are logged to an `errors.log` file and/or printed to the console. 
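The resume behaviour described in this docstring boils down to a small read-increment-write cycle around `last_page_fetched.txt`; a condensed sketch of that pattern, with the fetch-and-store step elided:

```python
import os

last_page_fetched = "last_page_fetched.txt"

# Pick up one page past the last page recorded by the previous run, if any.
if os.path.exists(last_page_fetched):
    with open(last_page_fetched, mode="r") as file:
        page = int(file.read()) + 1
else:
    page = 1

# ... fetch the page, transform it, and insert it into foia_data.db here ...

# Record the page just processed so an interrupted run can resume later.
with open(last_page_fetched, mode="w") as file:
    file.write(str(page))
```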
-''' - +""" import requests import sqlite3 @@ -28,18 +27,19 @@ import time from typing import List, Tuple, Dict, Any, Union, Literal -logging.basicConfig(filename='errors.log', level=logging.ERROR, - format='%(levelname)s: %(message)s') +logging.basicConfig( + filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" +) -base_url = 'https://www.muckrock.com/api_v1/foia/' -last_page_fetched = 'last_page_fetched.txt' +base_url = "https://www.muckrock.com/api_v1/foia/" +last_page_fetched = "last_page_fetched.txt" NO_MORE_DATA = -1 # flag for program exit JSON = Dict[str, Any] # type alias -create_table_query = ''' +create_table_query = """ CREATE TABLE IF NOT EXISTS results ( id INTEGER PRIMARY KEY, title TEXT, @@ -63,20 +63,20 @@ communications TEXT, absolute_url TEXT ) - ''' + """ -foia_insert_query = ''' +foia_insert_query = """ INSERT INTO results (id, title, slug, status, embargo_status, user, username, agency, datetime_submitted, date_due, days_until_due, date_followup, datetime_done, datetime_updated, date_embargo, tracking_id, price, disable_autofollowups, tags, communications, absolute_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ''' + """ def create_db() -> bool: - ''' + """ Creates foia_data.db SQLite database with one table named `results`. Returns: @@ -84,23 +84,22 @@ def create_db() -> bool: Raises: sqlite3.Error: If the table creation operation fails, prints error and returns False. - ''' + """ try: - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: conn.execute(create_table_query) conn.commit() - print('Successfully created foia_data.db!') + print("Successfully created foia_data.db!") return True except sqlite3.Error as e: - print(f'SQLite error: {e}.') - logging.error( - f'Failed to create foia_data.db due to SQLite error: {e}') + print(f"SQLite error: {e}.") + logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") return False def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - ''' + """ Fetches a page of 100 results from the MuckRock FOIA API. Args: @@ -111,30 +110,33 @@ def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - JSON Dict[str, Any]: The response's JSON data, if the request is successful. - NO_MORE_DATA (int = -1): A constant, if there are no more pages to fetch (indicated by a 404 response). - None: If there is an error other than 404. 
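In practice the request that `fetch_page` issues looks like the following; the page number is arbitrary, and a 404 is interpreted as the end of the data rather than as a failure:

```python
import requests

response = requests.get(
    "https://www.muckrock.com/api_v1/foia/",
    params={"page": 1, "page_size": 100, "format": "json"},
)

if response.status_code == 200:
    data = response.json()
    print(f"Fetched {len(data['results'])} FOIA requests")  # up to 100 per page
elif response.status_code == 404:
    print("No more pages to fetch")
```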
- ''' + """ per_page = 100 response = requests.get( - base_url, params={'page': page, 'page_size': per_page, 'format': 'json'}) + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) if response.status_code == 200: return response.json() elif response.status_code == 404: - print('No more pages to fetch') + print("No more pages to fetch") return NO_MORE_DATA # Typically 404 response will mean there are no more pages to fetch elif 500 <= response.status_code < 600: - logging.error(f'Server error {response.status_code} on page {page}') + logging.error(f"Server error {response.status_code} on page {page}") page = page + 1 return fetch_page(page) else: - print(f'Error fetching page {page}: {response.status_code}') - logging.error(f'Fetching page {page} failed with response code: { - response.status_code}') + print(f"Error fetching page {page}: {response.status_code}") + logging.error( + f"Fetching page {page} failed with response code: { + response.status_code}" + ) return None def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: - ''' + """ Transforms the data recieved from the MuckRock FOIA API into a structured format for insertion into a database with `populate_db()`. Transforms JSON input into a list of tuples, as well as serializes the nested `tags` and `communications` fields into JSON strings. @@ -144,43 +146,44 @@ def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: Returns: transformed_data (List[Tuple[Any, ...]]: A list of tuples, where each tuple contains the fields of a single FOIA request. - ''' + """ transformed_data = [] - for result in data_to_transform.get('results', []): - result['tags'] = json.dumps(result.get('tags', [])) - result['communications'] = json.dumps( - result.get('communications', [])) - - transformed_data.append(( - result['id'], - result['title'], - result['slug'], - result['status'], - result['embargo_status'], - result['user'], - result['username'], - result['agency'], - result['datetime_submitted'], - result['date_due'], - result['days_until_due'], - result['date_followup'], - result['datetime_done'], - result['datetime_updated'], - result['date_embargo'], - result['tracking_id'], - result['price'], - result['disable_autofollowups'], - result['tags'], - result['communications'], - result['absolute_url'] - )) + for result in data_to_transform.get("results", []): + result["tags"] = json.dumps(result.get("tags", [])) + result["communications"] = json.dumps(result.get("communications", [])) + + transformed_data.append( + ( + result["id"], + result["title"], + result["slug"], + result["status"], + result["embargo_status"], + result["user"], + result["username"], + result["agency"], + result["datetime_submitted"], + result["date_due"], + result["days_until_due"], + result["date_followup"], + result["datetime_done"], + result["datetime_updated"], + result["date_embargo"], + result["tracking_id"], + result["price"], + result["disable_autofollowups"], + result["tags"], + result["communications"], + result["absolute_url"], + ) + ) return transformed_data def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: - ''' + """ Populates foia_data.db SQLite database with the transfomed FOIA request data. Args: @@ -193,9 +196,9 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: Raises: sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are exhausted, logs error and exits. 
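Because transform_page_data() serializes the nested tags and communications fields with json.dumps() before insertion, anything reading rows back out of the results table needs to decode them again (search_foia_data_db.py below does this for communications). A minimal sketch, assuming foia_data.db has already been populated:

```python
import json
import sqlite3

# tags and communications are stored as JSON text by transform_page_data(),
# so decode them when reading rows back out of the results table.
with sqlite3.connect("foia_data.db") as conn:
    row = conn.execute(
        "SELECT title, tags, communications FROM results LIMIT 1"
    ).fetchone()

if row is not None:
    title, tags_json, communications_json = row
    print(title)
    print("tags:", json.loads(tags_json))
    print("communications:", len(json.loads(communications_json)), "messages")
```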
- ''' + """ - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: retries = 0 max_retries = 2 @@ -203,51 +206,55 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: try: conn.executemany(foia_insert_query, transformed_data) conn.commit() - print('Successfully inserted data!') + print("Successfully inserted data!") return except sqlite3.Error as e: - print(f'SQLite error: {e}. Retrying...') + print(f"SQLite error: {e}. Retrying...") conn.rollback() retries += 1 time.sleep(1) if retries == max_retries: - print(f'Failed to insert data from page {page} after { - max_retries} attempts. Skipping to next page.') - logging.error(f'Failed to insert data from page {page} after { - max_retries} attempts.') + print( + f"Failed to insert data from page {page} after { + max_retries} attempts. Skipping to next page." + ) + logging.error( + f"Failed to insert data from page {page} after { + max_retries} attempts." + ) def main() -> None: - ''' + """ Main entry point for create_foia_data_db.py. This function orchestrates the process of fetching FOIA requests data from the MuckRock FOIA API, transforming it, and storing it in a SQLite database. - ''' + """ - if not os.path.exists('foia_data.db'): - print('Creating foia_data.db...') + if not os.path.exists("foia_data.db"): + print("Creating foia_data.db...") success = create_db() if success == False: - print('Failed to create foia_data.db') + print("Failed to create foia_data.db") return if os.path.exists(last_page_fetched): - with open(last_page_fetched, mode='r') as file: + with open(last_page_fetched, mode="r") as file: page = int(file.read()) + 1 else: page = 1 while True: - print(f'Fetching page {page}...') + print(f"Fetching page {page}...") page_data = fetch_page(page) if page_data == NO_MORE_DATA: break # Exit program because no more data exixts if page_data is None: - print(f'Skipping page {page}...') + print(f"Skipping page {page}...") page += 1 continue @@ -255,16 +262,18 @@ def main() -> None: populate_db(transformed_data, page) - with open(last_page_fetched, mode='w') as file: + with open(last_page_fetched, mode="w") as file: file.write(str(page)) page += 1 - print('create_foia_data_db.py run finished') + print("create_foia_data_db.py run finished") -if __name__ == '__main__': +if __name__ == "__main__": try: main() except Exception as e: - logging.error(f'An unexpected error occurred: {e}') - print('Check errors.log to review errors. Run create_foia_data_db.py again to continue') + logging.error(f"An unexpected error occurred: {e}") + print( + "Check errors.log to review errors. 
Run create_foia_data_db.py again to continue" + ) diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index c1a0380f..86ede5d9 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -12,15 +12,19 @@ all_data = [] output_file = "foia_data.json" + # Function to fetch data from a specific page def fetch_page(page): - response = requests.get(base_url, params={"page": page, "page_size": per_page, "format": "json"}) + response = requests.get( + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) if response.status_code == 200: return response.json() else: print(f"Error fetching page {page}: {response.status_code}") return None + # Fetch and store data from all pages while True: print(f"Fetching page {page}...") @@ -30,14 +34,14 @@ def fetch_page(page): page += 1 continue - all_data.extend(data['results']) - if not data['next']: + all_data.extend(data["results"]) + if not data["next"]: break page += 1 # Write data to CSV -with open(output_file, mode='w', encoding='utf-8') as json_file: +with open(output_file, mode="w", encoding="utf-8") as json_file: json.dump(all_data, json_file, indent=4) print(f"Data written to {output_file}") diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index 4d57737d..455084a7 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -7,23 +7,48 @@ # Load the JSON data parser = argparse.ArgumentParser(description="Parse JSON from a file.") -parser.add_argument('--json_file', type=str, required=True, - help="Path to the JSON file") +parser.add_argument( + "--json_file", type=str, required=True, help="Path to the JSON file" +) args = parser.parse_args() -with open(args.json_file, 'r') as f: +with open(args.json_file, "r") as f: json_data = json.load(f) # Define the CSV headers headers = [ - "name", "agency_described", "record_type", "description", "source_url", - "readme_url", "scraper_url", "state", "county", "municipality", - "agency_type", "jurisdiction_type", "View Archive", "agency_aggregation", - "agency_supplied", "supplying_entity", "agency_originated", "originating_agency", - "coverage_start", "source_last_updated", "coverage_end", "number_of_records_available", - "size", "access_type", "data_portal_type", "access_notes", "record_format", "update_frequency", - "update_method", "retention_schedule", "detail_level" + "name", + "agency_described", + "record_type", + "description", + "source_url", + "readme_url", + "scraper_url", + "state", + "county", + "municipality", + "agency_type", + "jurisdiction_type", + "View Archive", + "agency_aggregation", + "agency_supplied", + "supplying_entity", + "agency_originated", + "originating_agency", + "coverage_start", + "source_last_updated", + "coverage_end", + "number_of_records_available", + "size", + "access_type", + "data_portal_type", + "access_notes", + "record_format", + "update_frequency", + "update_method", + "retention_schedule", + "detail_level", ] @@ -59,7 +84,7 @@ def get_jurisdiction(jurisdiction_id): output_csv = format_filename_json_to_csv(args.json_file) # Open a CSV file for writing -with open(output_csv, 'w', newline='') as csvfile: +with open(output_csv, "w", newline="") as csvfile: writer = csv.DictWriter(csvfile, fieldnames=headers) # Write the header row @@ -87,8 +112,7 
@@ def get_jurisdiction(jurisdiction_id): juris_type = "state" # local jurisdiction level if jurisdiction_level == "l": - parent_juris_data = get_jurisdiction( - jurisdiction_data.get("parent")) + parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) state = parent_juris_data.get("abbrev") if "County" in jurisdiction_data.get("name"): county = jurisdiction_data.get("name") @@ -99,24 +123,24 @@ def get_jurisdiction(jurisdiction_id): municipality = jurisdiction_data.get("name") juris_type = "local" - if 'Police' in agency_data.get("types"): - agency_type = 'law enforcement/police' + if "Police" in agency_data.get("types"): + agency_type = "law enforcement/police" else: - agency_type = '' + agency_type = "" - source_url = '' + source_url = "" absolute_url = item.get("absolute_url") - access_type = '' + access_type = "" for comm in item["communications"]: if comm["files"]: - source_url = absolute_url + '#files' - access_type = 'Web page,Download,API' + source_url = absolute_url + "#files" + access_type = "Web page,Download,API" break # Extract the relevant fields from the JSON object csv_row = { "name": item.get("title", ""), - "agency_described": agency_data.get("name", "") + ' - ' + state, + "agency_described": agency_data.get("name", "") + " - " + state, "record_type": "", "description": "", "source_url": source_url, @@ -145,7 +169,7 @@ def get_jurisdiction(jurisdiction_id): "update_frequency": "", "update_method": "", "retention_schedule": "", - "detail_level": "" + "detail_level": "", } # Write the extracted row to the CSV file diff --git a/source_collectors/muckrock/get_all_record_types.py b/source_collectors/muckrock/get_all_record_types.py index bcc8c0b7..6fa955d2 100644 --- a/source_collectors/muckrock/get_all_record_types.py +++ b/source_collectors/muckrock/get_all_record_types.py @@ -1,17 +1,50 @@ import subprocess -record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings', - 'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports', - 'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports', - 'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports', - 'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports', - 'court cases', 'incarceration records'] +record_types = [ + "accident reports", + "arrest records", + "calls for service", + "car gps", + "citations", + "dispatch logs", + "dispatch recordings", + "field contacts", + "incident reports", + "misc police activity", + "officer involved shootings", + "stops", + "surveys", + "use of force reports", + "vehicle pursuits", + "complaints and misconduct", + "daily activity logs", + "training and hiring info", + "personnel records", + "annual and monthly reports", + "budgets and finances", + "contact info and agency meta", + "geographic", + "list of data sources", + "policies and contracts", + "crime maps and reports", + "crime statistics", + "media bulletins", + "records request info", + "resources", + "sex offender registry", + "wanted persons", + "booking reports", + "court cases", + "incarceration records", +] for record_type in record_types: - command = ['python', 'search_foia_data_db.py', '--search_for', 
record_type] + command = ["python", "search_foia_data_db.py", "--search_for", record_type] try: subprocess.run(command, check=True) except subprocess.CalledProcessError as e: - print(f'An error occurred while executing the command for "{ - record_type}": {e}') + print( + f'An error occurred while executing the command for "{ + record_type}": {e}' + ) diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index 96cde838..02f7a4ec 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -2,6 +2,7 @@ import json import time + # Function to fetch jurisdiction IDs based on town names from a text file def fetch_jurisdiction_ids(town_file, base_url): with open(town_file, "r") as file: @@ -14,12 +15,14 @@ def fetch_jurisdiction_ids(town_file, base_url): response = requests.get(url) if response.status_code == 200: data = response.json() - for item in data.get('results', []): - if item['name'] in town_names: - jurisdiction_ids[item['name']] = item['id'] + for item in data.get("results", []): + if item["name"] in town_names: + jurisdiction_ids[item["name"]] = item["id"] url = data.get("next") - print(f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far...") + print( + f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far..." + ) time.sleep(1) # To respect the rate limit elif response.status_code == 503: @@ -31,6 +34,7 @@ def fetch_jurisdiction_ids(town_file, base_url): return jurisdiction_ids + # Function to fetch FOIA data for each jurisdiction ID and save it to a JSON file def fetch_foia_data(jurisdiction_ids): all_data = [] @@ -42,7 +46,9 @@ def fetch_foia_data(jurisdiction_ids): data = response.json() all_data.extend(data.get("results", [])) url = data.get("next") - print(f"Fetching records for {name}, {len(all_data)} total records so far...") + print( + f"Fetching records for {name}, {len(all_data)} total records so far..." 
+ ) time.sleep(1) # To respect the rate limit elif response.status_code == 503: print(f"Error 503: Skipping page for {name}") @@ -57,10 +63,13 @@ def fetch_foia_data(jurisdiction_ids): print(f"Saved {len(all_data)} records to foia_data_combined.json") + # Main function to execute the script def main(): town_file = "allegheny-county-towns.txt" - jurisdiction_url = "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126" + jurisdiction_url = ( + "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126" + ) # Fetch jurisdiction IDs based on town names jurisdiction_ids = fetch_jurisdiction_ids(town_file, jurisdiction_url) @@ -69,6 +78,7 @@ def main(): # Fetch FOIA data for each jurisdiction ID fetch_foia_data(jurisdiction_ids) + # Run the main function if __name__ == "__main__": main() diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index ed1db454..a0160a86 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -14,17 +14,23 @@ while True: # Make the GET request with the search string as a query parameter - response = requests.get(base_url, params={"page" : page, "page_size" : per_page, "format": "json"}) + response = requests.get( + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) # Check if the request was successful if response.status_code == 200: # Parse the JSON response data = response.json() - if not data['results']: + if not data["results"]: break - filtered_results = [item for item in data['results'] if search_string.lower() in item['title'].lower()] + filtered_results = [ + item + for item in data["results"] + if search_string.lower() in item["title"].lower() + ] all_results.extend(filtered_results) @@ -44,7 +50,7 @@ # Dump list into a JSON file json_out_file = search_string.replace(" ", "_") + ".json" -with open(json_out_file, 'w') as json_file: +with open(json_out_file, "w") as json_file: json.dump(all_results, json_file) print(f"List dumped into {json_out_file}") diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py index dafd6de2..46b65808 100644 --- a/source_collectors/muckrock/muckrock_ml_labeler.py +++ b/source_collectors/muckrock/muckrock_ml_labeler.py @@ -11,16 +11,22 @@ # Load the dataset from command line argument parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") -parser.add_argument('--csv_file', type=str, required=True, help="Path to the CSV file") +parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file") args = parser.parse_args() df = pd.read_csv(args.csv_file) # Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row -columns_to_combine = ['url_path', 'html_title', 'h1'] # Add other columns here as needed -df['combined_text'] = df[columns_to_combine].apply(lambda row: ' '.join(row.values.astype(str)), axis=1) +columns_to_combine = [ + "url_path", + "html_title", + "h1", +] # Add other columns here as needed +df["combined_text"] = df[columns_to_combine].apply( + lambda row: " ".join(row.values.astype(str)), axis=1 +) # Convert the combined text into a list -texts = df['combined_text'].tolist() +texts = df["combined_text"].tolist() # Tokenize the inputs inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") @@ -37,5 +43,5 @@ predicted_labels = [labels[int(pred)] for pred in predictions] # Add the predicted labels to the dataframe and save 
-df['predicted_label'] = predicted_labels +df["predicted_label"] = predicted_labels df.to_csv("labeled_muckrock_dataset.csv", index=False) diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py index ff9aac68..12290591 100644 --- a/source_collectors/muckrock/search_foia_data_db.py +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -1,4 +1,4 @@ -''' +""" search_foia_data_db.py This script provides search functionality for the `foia_data.db` SQLite database. The search looks in `title`s and @@ -16,8 +16,7 @@ Error Handling: Errors encountered during database operations, JSON parsing, or file writing are printed to the console. -''' - +""" import sqlite3 import pandas as pd @@ -26,37 +25,43 @@ import os from typing import Union, List, Dict -check_results_table_query = ''' +check_results_table_query = """ SELECT name FROM sqlite_master WHERE (type = 'table') AND (name = 'results') - ''' + """ -search_foia_query = ''' +search_foia_query = """ SELECT * FROM results WHERE (title LIKE ? OR tags LIKE ?) AND (status = 'done') - ''' + """ def parser_init() -> argparse.ArgumentParser: - ''' + """ Initializes the argument parser for search_foia_data_db.py. Returns: argparse.ArgumentParser: The configured argument parser. - ''' + """ parser = argparse.ArgumentParser( - description='Search foia_data.db and generate a JSON file of resulting matches') - parser.add_argument('--search_for', type=str, required=True, metavar='', - help='Provide a string to search foia_data.db') + description="Search foia_data.db and generate a JSON file of resulting matches" + ) + parser.add_argument( + "--search_for", + type=str, + required=True, + metavar="", + help="Provide a string to search foia_data.db", + ) return parser def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]: - ''' + """ Searches the foia_data.db database for FOIA request entries matching the provided search string. Args: @@ -70,35 +75,35 @@ def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]: Raises: sqlite3.Error: If any database operation fails, prints error and returns None. Exception: If any unexpected error occurs, prints error and returns None. - ''' + """ print(f'Searching foia_data.db for "{search_string}"...') try: - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: results_table = pd.read_sql_query(check_results_table_query, conn) if results_table.empty: - print('The `results` table does not exist in the database.') + print("The `results` table does not exist in the database.") return None - params = [f'%{search_string}%', f'%{search_string}%'] + params = [f"%{search_string}%", f"%{search_string}%"] df = pd.read_sql_query(search_foia_query, conn, params=params) except sqlite3.Error as e: - print(f'Sqlite error: {e}') + print(f"Sqlite error: {e}") return None except Exception as e: - print(f'An unexpected error occurred: {e}') + print(f"An unexpected error occurred: {e}") return None return df def parse_communications_column(communications) -> List[Dict]: - ''' + """ Parses a communications column value, decoding it from JSON format. Args: @@ -110,19 +115,19 @@ def parse_communications_column(communications) -> List[Dict]: Raises: json.JSONDecodeError: If deserialization fails, prints error and returns empty list. 
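The WHERE clause above relies on SQL LIKE with leading and trailing % wildcards, so the --search_for string matches anywhere in the title or tags, and SQLite's LIKE is case-insensitive for ASCII text by default. A self-contained illustration against an in-memory table with the same columns:

```python
import sqlite3

# Demonstrates the substring matching used by search_foia_data_db.py:
# '%use of force%' matches anywhere in the column, case-insensitively.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE results (title TEXT, tags TEXT, status TEXT)")
conn.execute(
    "INSERT INTO results VALUES ('Use of Force Policy 2021', '[]', 'done')"
)
rows = conn.execute(
    "SELECT title FROM results WHERE (title LIKE ? OR tags LIKE ?) AND (status = 'done')",
    ["%use of force%", "%use of force%"],
).fetchall()
print(rows)  # [('Use of Force Policy 2021',)]
conn.close()
```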
- ''' + """ if pd.isna(communications): return [] try: return json.loads(communications) except json.JSONDecodeError as e: - print(f'Error decoding JSON: {e}') + print(f"Error decoding JSON: {e}") return [] def generate_json(df: pd.DataFrame, search_string: str) -> None: - ''' + """ Generates a JSON file from a pandas DataFrame. Args: @@ -136,46 +141,49 @@ def generate_json(df: pd.DataFrame, search_string: str) -> None: Raises: Exception: If writing to JSON file operation fails, prints error and returns. - ''' + """ - output_json = f'{search_string.replace(' ', '_')}.json' + output_json = f"{search_string.replace(' ', '_')}.json" try: - df.to_json(output_json, orient='records', indent=4) + df.to_json(output_json, orient="records", indent=4) print(f'Matching entries written to "{output_json}"') except Exception as e: - print(f'An error occurred while writing JSON: {e}') + print(f"An error occurred while writing JSON: {e}") def main() -> None: - ''' + """ Function to search the foia_data.db database for entries matching a specified search string. Command Line Args: --search_for (str): A string to search for in the `title` and `tags` fields of FOIA requests. - ''' + """ parser = parser_init() args = parser.parse_args() search_string = args.search_for - if not os.path.exists('foia_data.db'): - print('foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.') + if not os.path.exists("foia_data.db"): + print( + "foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it." + ) return df = search_foia_db(search_string) if df is None: return - if not df['communications'].empty: - df['communications'] = df['communications'].apply( - parse_communications_column) + if not df["communications"].empty: + df["communications"] = df["communications"].apply(parse_communications_column) - print(f'Found {df.shape[0]} matching entries containing "{ - search_string}" in the title or tags') + print( + f'Found {df.shape[0]} matching entries containing "{ + search_string}" in the title or tags' + ) generate_json(df, search_string) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py index 9e61d49c..66e6aca6 100644 --- a/source_collectors/muckrock/search_local_foia_json.py +++ b/source_collectors/muckrock/search_local_foia_json.py @@ -1,38 +1,44 @@ import json # Specify the JSON file path -json_file = 'foia_data.json' -search_string = 'use of force' +json_file = "foia_data.json" +search_string = "use of force" # Load the JSON data -with open(json_file, 'r', encoding='utf-8') as file: +with open(json_file, "r", encoding="utf-8") as file: data = json.load(file) # List to store matching entries matching_entries = [] + # Function to search within an entry def search_entry(entry): # Check if 'status' is 'done' - if entry.get('status') != 'done': + if entry.get("status") != "done": return False - + # Check if 'title' or 'tags' field contains the search string - title_match = 'title' in entry and search_string.lower() in entry['title'].lower() - tags_match = 'tags' in entry and any(search_string.lower() in tag.lower() for tag in entry['tags']) - + title_match = "title" in entry and search_string.lower() in entry["title"].lower() + tags_match = "tags" in entry and any( + search_string.lower() in tag.lower() for tag in entry["tags"] + ) + return title_match or tags_match + # Iterate through the data and collect matching entries for 
entry in data: if search_entry(entry): matching_entries.append(entry) # Output the results -print(f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags.") +print( + f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags." +) # Optionally, write matching entries to a new JSON file -with open('matching_entries.json', 'w', encoding='utf-8') as file: +with open("matching_entries.json", "w", encoding="utf-8") as file: json.dump(matching_entries, file, indent=4) print(f"Matching entries written to 'matching_entries.json'") diff --git a/source_collectors/muckrock/utils.py b/source_collectors/muckrock/utils.py index ca66dc8c..3d8b63db 100644 --- a/source_collectors/muckrock/utils.py +++ b/source_collectors/muckrock/utils.py @@ -1,18 +1,17 @@ -''' +""" utils.py Provides useful functions for muckrock_tools. Functions: - format_filename_json_to_csv() -''' - +""" import re def format_filename_json_to_csv(json_filename: str) -> str: - ''' + """ Converts JSON filename format to CSV filename format. Args: @@ -21,7 +20,7 @@ def format_filename_json_to_csv(json_filename: str) -> str: Returns: csv_filename (str): A CSV filename string. - ''' - csv_filename = re.sub(r'_(?=[^.]*$)', '-', json_filename[:-5]) + '.csv' + """ + csv_filename = re.sub(r"_(?=[^.]*$)", "-", json_filename[:-5]) + ".csv" return csv_filename From e8014b8cf1ae6a9c98685067747a7d9f8a4d507c Mon Sep 17 00:00:00 2001 From: eddie-m-m Date: Fri, 15 Nov 2024 17:30:09 -0800 Subject: [PATCH 04/11] Clean up files, lint (again) --- common_crawler/README.md | 87 ----- common_crawler/__init__.py | 0 common_crawler/argparser.py | 95 ----- common_crawler/cache.py | 93 ----- common_crawler/config.ini | 19 - common_crawler/crawler.py | 139 ------- common_crawler/csv_manager.py | 73 ---- common_crawler/data/cache.json | 7 - common_crawler/data/urls.csv | 207 ----------- common_crawler/main.py | 350 ------------------ .../requirements_common_crawler_action.txt | 3 - common_crawler/utils.py | 22 -- .../convert_all_record_types_to_csv.py | 57 --- .../muckrock/download_muckrock_foia.py | 9 + .../generate_detailed_muckrock_csv.py | 5 +- .../muckrock/get_all_record_types.py | 50 --- .../muckrock/get_allegheny_foias.py | 4 + source_collectors/muckrock/muck_get.py | 5 + .../muckrock/muckrock_ml_labeler.py | 5 + .../muckrock/search_foia_data_db.py | 3 +- .../muckrock/search_local_foia_json.py | 9 +- 21 files changed, 35 insertions(+), 1207 deletions(-) delete mode 100644 common_crawler/README.md delete mode 100644 common_crawler/__init__.py delete mode 100644 common_crawler/argparser.py delete mode 100644 common_crawler/cache.py delete mode 100644 common_crawler/config.ini delete mode 100644 common_crawler/crawler.py delete mode 100644 common_crawler/csv_manager.py delete mode 100644 common_crawler/data/cache.json delete mode 100644 common_crawler/data/urls.csv delete mode 100644 common_crawler/main.py delete mode 100644 common_crawler/requirements_common_crawler_action.txt delete mode 100644 common_crawler/utils.py delete mode 100644 source_collectors/muckrock/convert_all_record_types_to_csv.py delete mode 100644 source_collectors/muckrock/get_all_record_types.py diff --git a/common_crawler/README.md b/common_crawler/README.md deleted file mode 100644 index 3701b5d5..00000000 --- a/common_crawler/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Common Crawler - -This module interfaces with the Common Crawl dataset to extract urls. 
- -## Installation - -Python Version Required: 3.11 - -To install all necessary dependencies, run the following command from the root directory: - -```bash -pip install -r requirements.txt -``` - - -## Usage Example - -### Environment Requirements - -Please ensure you have a `.env` file located in the root directory (not the `common_crawler` directory) -which contains the following environment variable: - -* HUGGINGFACE_ACCESS_TOKEN = The access token to enable writing to the associated PDAP dataset. -To obtain your access token, consult user settings at -and ensure you have write access to . -* LABEL_STUDIO_ACCESS_TOKEN = The access token for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. -* LABEL_STUDIO_PROJECT_ID = The project ID for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL. - -### Instructions - -Run the following script from the root directory -```bash -python common_crawler/main.py CC-MAIN-2023-50 '*.gov' police --config common_crawler/config.ini --pages 2 -``` - -This example will crawl a single page (typically 15000 records) of the Common Crawl dataset with ID `CC-MAIN-2023-50` -and search for the term `police` in all the pages with the `.gov` domain. It will use the default configuration file `config.ini` -to determine the json cache location and the location of the output csv file. - -Note that the cache records the most recent page number that was used for given combination of Common Crawl ID, url search term, and keyword. -If the same command is run again, it will start from the next page. -If you want to reset the cache, you can use the `--reset-cache` flag. - -By default, the output csv file will be named `urls.csv` and will be located in the `data` directory of the module. -This csv file contains both the url and the parameters used to query it. - -### Parameters - -- **common_crawl_id**: Required. Specifies the Common Crawl Index to perform the search on. -- **url**: Required. Specifies the domain URL to query. Wildcard characters such as * can be used to expand the search. Note that the query must be contained within quotes (as in '*.gov') to prevent misinterpretation of wildcards -- **search_term**: Required. Specifies keyword within the url to search for. -- **-c or --config**: Optional. Specifies the configuration file to use. The default value is config.ini. -- **-p or --pages**: Optional. Specifies the number of pages to search. The default value is 1. -- **--reset-cache**: Optional. If set, it resets the cache before starting the crawl. - -### Configuration - -Several attributes are currently defined in `config.ini`: -- **cache_filename**: This is the name of the cache file. The default value is `cache`. The file will be saved with a `.json` extension. -- **output_filename**: This is the name of the output file. The default value is `urls`. The file will be saved with a `.csv` extension. -- **data_dir**: This is the directory where the cache and output files will be saved. The default value is `data`. -- **huggingface_repo_id**: This is the repository ID for the hugging face dataset which urls will be uploaded to - -## Code Structure - -The code is structured as follows: -- **main.py**: This is the main file that is used to run the module. 
It contains the logic to parse the command line arguments and call the necessary functions. -- **crawler.py**: This file contains the logic to interface with the Common Crawl dataset and extract urls. -- **cache.py**: This file contains the logic to read and write the cache file. -- **argparser.py**: This file contains the logic to parse the command line and config arguments. -- **csv_manager.py**: This file contains the logic to write the output csv file. -- **utils.py**: This file contains utility functions. -- **config.ini**: This file contains the default configuration values. -- **README.md**: This file contains the documentation for the module. You're reading it right now. Isn't that nifty! - -## Testing - -A suite of unit and integration tests were developed for this module. - -To run the tests, run the following command from this directory: - -```bash -pytest ../tests/test_common_crawler_integration.py -pytest ../tests/test_common_crawler_unit.py -``` \ No newline at end of file diff --git a/common_crawler/__init__.py b/common_crawler/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/common_crawler/argparser.py b/common_crawler/argparser.py deleted file mode 100644 index 67f4a290..00000000 --- a/common_crawler/argparser.py +++ /dev/null @@ -1,95 +0,0 @@ -import argparse -import configparser -import re - -""" -This module contains the argument parser for command line arguments -for the Common Crawler script. -""" - - -def valid_common_crawl_id(common_crawl_id: str) -> bool: - """ - Validate the Common Crawl ID format. - The Common Crawl ID should be in the format CC-MAIN-YYYY-WW. - Args: - common_crawl_id: The Common Crawl ID to validate - Returns: - True if the Common Crawl ID is valid, False otherwise - """ - return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None - - -def parse_args() -> argparse.Namespace: - """ - Parse the command line arguments for the Common Crawler script - as well as the configuration file. - Arguments parsed include: - - The Common Crawl ID - - The URL to query - - The search term - - The number of pages to search - - The configuration file (defaults to config.ini) - - A flag to reset the cache - Returns: The parsed arguments - """ - - parser = argparse.ArgumentParser( - description="Query the Common Crawl dataset and optionally save the results to a file." - ) - # Add the required arguments - parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") - parser.add_argument("url", type=str, help="The URL to query") - parser.add_argument("keyword", type=str, help="The keyword to search in the url") - # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument( - "-c", - "--config", - type=str, - default="config.ini", - help="The configuration file to use", - ) - parser.add_argument( - "-p", - "--pages", - type=int, - default=1, - help="The number of pages to search (default: 1)", - ) - parser.add_argument( - "--reset-cache", - action="store_true", - default=False, - help="Reset the cache before starting the crawl", - ) - - args = parser.parse_args() - - # Validate the Common Crawl ID format - if not valid_common_crawl_id(args.common_crawl_id): - parser.error( - "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." 
- ) - - # Read the configuration file - config = configparser.ConfigParser() - config.read(args.config) - - # Combine parsed arguments with configuration file defaults - app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config["DEFAULT"]) - - app_args = app_parser.parse_args() - - # Print arguments - print(f"--Common Crawl ID: {app_args.common_crawl_id}") - print(f"--URL: {app_args.url}") - print(f"--Keyword: {app_args.keyword}") - print(f"--Number of Pages: {app_args.pages}") - print(f"--Configuration File: {app_args.config}") - print(f"--Reset Cache: {app_args.reset_cache}") - print(f"--Output File: {app_args.output_filename}.csv") - print(f"--Cache File: {app_args.cache_filename}.json") - print(f"--Data Directory: {app_args.data_dir}") - - return app_args diff --git a/common_crawler/cache.py b/common_crawler/cache.py deleted file mode 100644 index 23d58819..00000000 --- a/common_crawler/cache.py +++ /dev/null @@ -1,93 +0,0 @@ -import json - -from util.miscellaneous_functions import get_file_path - -""" -This module contains classes for managing a cache of Common Crawl search results -These classes include: - - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results -""" - - -class CommonCrawlerCacheManager: - """ - A class for managing the cache of Common Crawl search results. - This class is responsible for adding, retrieving, and saving cache data. - """ - - def __init__(self, file_name: str = "cache", directory=None): - """ - Initializes the CacheStorage object with a file name and directory. - Args: - file_name: the name of the cache file - directory: the directory to store the cache file - """ - self.file_path = get_file_path(f"{file_name}.json", directory) - print(f"Cache file path: {self.file_path}") - self.cache = self.load_or_create_cache() - - def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: - """ - Updates the cache with the last page crawled for a given index, url, and keyword. - Or adds a new cache object if it does not exist. - Args: - index: the index of the common crawl - url: the url to search - keyword: the search term to use - last_page: the last page crawled - Returns: None - """ - if index not in self.cache: - self.cache[index] = {} - if url not in self.cache[index]: - self.cache[index][url] = {} - self.cache[index][url][keyword] = last_page - - def get(self, index, url, keyword) -> int: - """ - Retrieves a page number from the cache. - Args: - index: the index of the common crawl - url: the url to search - keyword: the search term to use - - Returns: int - the last page crawled - - """ - if ( - index in self.cache - and url in self.cache[index] - and keyword in self.cache[index][url] - ): - return self.cache[index][url][keyword] - # The cache object does not exist. Return 0 as the default value. - return 0 - - def load_or_create_cache(self) -> dict: - """ - Loads the cache from the configured file path. - If the file does not exist, an empty dictionary is returned. - Returns: dict - the cache data - """ - try: - with open(self.file_path, "r") as file: - return json.load(file) - except FileNotFoundError: - return {} - - def save_cache(self) -> None: - """ - Converts the cache object into a JSON-serializable format and saves it to the configured file path. - This method ensures the cache is stored in a readable and easily reloadable format, allowing for - persistence of crawl data across sessions. 
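The persisted cache is nothing more than the nested dictionary upsert() builds: crawl index, then URL pattern, then keyword, mapping to the last page crawled. A minimal sketch that reproduces the shape of data/cache.json shown further down in this patch:

```python
import json

# Shape of the persisted cache: index -> url pattern -> keyword -> last page,
# matching the example data/cache.json ({"CC-MAIN-2023-50": {"*.gov": {"police": 10}}}).
cache = {}
cache.setdefault("CC-MAIN-2023-50", {}).setdefault("*.gov", {})["police"] = 10
print(json.dumps(cache, indent=4))
```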
- """ - # Reformat cache data for JSON serialization - with open(self.file_path, "w") as file: - json.dump(self.cache, file, indent=4) - - def reset_cache(self) -> None: - """ - Resets the cache to an empty state. - """ - self.cache = {} - print("Cache has been reset.") diff --git a/common_crawler/config.ini b/common_crawler/config.ini deleted file mode 100644 index fc558303..00000000 --- a/common_crawler/config.ini +++ /dev/null @@ -1,19 +0,0 @@ -# This configuration file contains default settings for the Common Crawler application. -# Settings can be modified to suit different environments or testing needs. - -[DEFAULT] -# Filename for the cache. Stores which pages have been crawled -# at which combinations of index, url search term, and keyword -# to avoid re-crawling them. -cache_filename = cache - -# Directory where data files (both cache and output) are stored. -# Change as needed for different environments. -# Path is relative from working directory that executes common_crawler/main.py -data_dir = common_crawler/data - -# Filename for the output CSV containing crawled URLs. -output_filename = urls - -# Name of the huggingface repo -huggingface_repo_id = PDAP/unlabeled-urls \ No newline at end of file diff --git a/common_crawler/crawler.py b/common_crawler/crawler.py deleted file mode 100644 index 0982ca53..00000000 --- a/common_crawler/crawler.py +++ /dev/null @@ -1,139 +0,0 @@ -import json -import time -from urllib.parse import quote_plus -from http import HTTPStatus - -import requests - -from .utils import URLWithParameters -from dataclasses import dataclass -from collections import namedtuple - -""" -This module contains classes for managing a cache of Common Crawl search results -""" - -# TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - - -@dataclass -class CommonCrawlResult: - last_page_search: int - url_results: list[str] - - -class CommonCrawlerManager: - """ - This class orchestrates the crawling process, leveraging CommonCrawler for - actual interactions with the Common Crawl Index Server and CommonCrawlerCacheManager - for caching results. - It validates crawl ids, manages pagination, and aggregates results. 
- """ - - def __init__(self, crawl_id="CC-MAIN-2023-50"): - self.crawl_id = crawl_id - CC_INDEX_SERVER = "http://index.commoncrawl.org/" - INDEX_NAME = f"{self.crawl_id}-index" - self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" - - def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: - print( - f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}" - ) - - url_results = [] - - end_page = start_page + num_pages - last_page = start_page - - for next_page in range(start_page, end_page): - records = self.search_common_crawl_index(search_term, next_page) - - # If records were found, filter them and add to results - if not records: - continue - - keyword_urls = self.get_urls_with_keyword(records, keyword) - url_results.extend(keyword_urls) - - last_page = next_page - - # Wait 5 seconds before making the next request, to avoid overloading the server - time.sleep(5) - - return CommonCrawlResult(last_page, url_results) - - def search_common_crawl_index( - self, url: str, page: int = 0, max_retries: int = 20 - ) -> list[dict]: - """ - This method is used to search the Common Crawl index for a given URL and page number - Args: - url: a URL to search for - page: the page number to search - - Returns: A list of records (dictionaries) containing the search results - - """ - encoded_url = quote_plus(url) - search_url = URLWithParameters(self.root_url) - search_url.add_parameter("url", encoded_url) - search_url.add_parameter("output", "json") - search_url.add_parameter("page", page) - - retries = 0 - delay = 1 - - # put HTTP GET request in re-try loop in case of rate limiting. Once per second is nice enough per common crawl doc. - while retries < max_retries: - response = self.make_request(search_url) - if response: - return self.process_response(response, url, page) - - retries += 1 - print( - f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" - ) - time.sleep(delay) - - print(f"Max retries exceeded. Failed to get records for {url} on page {page}.") - return None - - def make_request(self, search_url: str) -> requests.Response: - """ - Makes the HTTP GET request to the given search URL. - Return the response if successful, None if rate-limited. 
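For orientation, this is roughly the request that search_common_crawl_index() assembles: a GET against the index server endpoint named after the crawl id, with url, output, and page parameters. A minimal sketch using values that appear elsewhere in this patch (CC-MAIN-2023-50 and the *.gov pattern); the real crawler also paces its requests and retries on SlowDown responses:

```python
import json

import requests

# One page of results from the Common Crawl index server; each line of the
# response body is a JSON record describing a captured URL.
crawl_id = "CC-MAIN-2023-50"
index_url = f"http://index.commoncrawl.org/{crawl_id}-index"
response = requests.get(
    index_url, params={"url": "*.gov", "output": "json", "page": 0}
)
if response.status_code == 200:
    records = [json.loads(line) for line in response.text.strip().split("\n")]
    print(f"{len(records)} records on page 0")
else:
    print(f"Index server returned {response.status_code}")
```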
- """ - try: - response = requests.get(str(search_url)) - response.raise_for_status() - return response - except requests.exceptions.RequestException as e: - if ( - response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR - and "SlowDown" in response.text - ): - return None - else: - print(f"Failed to get records: {e}") - return None - - def process_response( - self, response: requests.Response, url: str, page: int - ) -> list[dict]: - """Processes the HTTP response and returns the parsed records if successful.""" - if response.status_code == HTTPStatus.OK: - records = response.text.strip().split("\n") - print(f"Found {len(records)} records for {url} on page {page}") - return [json.loads(record) for record in records] - elif "First Page is 0, Last Page is 0" in response.text: - print("No records exist in index matching the url search term") - return None - else: - print(f"Unexpected response: {response.status_code}") - return None - - @staticmethod - def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record["url"] for record in records if keyword in record["url"]] diff --git a/common_crawler/csv_manager.py b/common_crawler/csv_manager.py deleted file mode 100644 index 2b823b42..00000000 --- a/common_crawler/csv_manager.py +++ /dev/null @@ -1,73 +0,0 @@ -import csv -import os - -from util.miscellaneous_functions import get_file_path - - -class CSVManager: - """ - Manages a CSV file for storing URLs. - Creates the file if it doesn't exist, and provides a method for adding new rows. - """ - - def __init__(self, file_name: str, headers: list[str], directory=None): - self.file_path = get_file_path(f"{file_name}.csv", directory) - self.headers = headers - if not os.path.exists(self.file_path): - self.initialize_file() - - def add_row(self, row_values: list[str] | tuple[str]): - """ - Appends a new row of data to the CSV. - Args: - row_values: list of values to add to the csv, in order of their inclusion in the list - """ - if isinstance(row_values, str): - # Single values must be converted to a list format - row_values = [row_values] - try: - with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: - writer = csv.writer(file) - writer.writerow(row_values) - except Exception as e: - print(f"An error occurred while trying to write to {self.file_path}: {e}") - - def add_rows(self, results: list[list[str]]) -> None: - """ - Appends multiple rows of data to the CSV as a list of lists of strings. - Args: - results: list[list[str] - a list of lists of strings, each inner list representing a row - Returns: None - """ - for result in results: - self.add_row(result) - print(f"{len(results)} URLs written to {self.file_path}") - - def initialize_file(self): - """ - Initializes the CSV file. - If the file doesn't exist, it creates it with the header row. - """ - # check if file exists - file_exists = os.path.isfile(self.file_path) - - if not file_exists: - with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: - writer = csv.writer(file) - writer.writerow(self.headers) - else: - # Open and check that headers match - with open(self.file_path, mode="r", encoding="utf-8") as file: - header_row = next(csv.reader(file)) - if header_row != self.headers: - raise ValueError( - f"Header row in {self.file_path} does not match expected headers" - ) - print(f"CSV file initialized at {self.file_path}") - - def delete_file(self): - """ - Deletes the CSV file. 
- """ - os.remove(self.file_path) - print(f"CSV file deleted at {self.file_path}") diff --git a/common_crawler/data/cache.json b/common_crawler/data/cache.json deleted file mode 100644 index e12687ad..00000000 --- a/common_crawler/data/cache.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "CC-MAIN-2023-50": { - "*.gov": { - "police": 10 - } - } -} \ No newline at end of file diff --git a/common_crawler/data/urls.csv b/common_crawler/data/urls.csv deleted file mode 100644 index 6fc4dc6f..00000000 --- a/common_crawler/data/urls.csv +++ /dev/null @@ -1,207 +0,0 @@ -Index,Search Term,Keyword,Page,URL -CC-MAIN-2023-50,*.gov,police,2,https://acworth-ga.gov/administering-the-oath-of-office-to-a-newly-promoted-member-of-the-police-department/ -CC-MAIN-2023-50,*.gov,police,2,https://www.ada.gov/policevideo/policebroadbandgallery.htm -CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/franklintonpolice.htm -CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/illinois_state_police.htm -CC-MAIN-2023-50,*.gov,police,2,https://www.adamn.gov/p/other/police-department -CC-MAIN-2023-50,*.gov,police,2,https://www.adamscountypa.gov/police/earpd -CC-MAIN-2023-50,*.gov,police,2,https://www.aftonwyoming.gov/government/police_department/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/community_relations.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/community_relations.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/crime_snapshot_statistics.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/crime_snapshot_statistics.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/index.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/investigative_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/investigative_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/procedures.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/procedures.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/recruiting/index.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/recruiting/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/services_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/services_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/transparency_hub.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/transparency_hub.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/uniform_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/uniform_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/zone_command.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/zone_command.php -CC-MAIN-2023-50,*.gov,police,6,https://adeca.alabama.gov/2022/11/14/gov-ivey-announces-grant-to-help-auburn-police-deter-crime/ -CC-MAIN-2023-50,*.gov,police,7,https://governor.alabama.gov/newsroom/2020/02/kimberly-police-officer-nick-orear-flag-memo/ 
-CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/de/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/police-jurisdictions/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ru/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2015-police-jurisdiction-annexations-deannexations-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2023-police-jurisdiction-deannexations-ordinances-and-maps/ -CC-MAIN-2023-50,*.gov,police,8,https://tourism.alabama.gov/tag/world-police-and-fire-games/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/departments/police-department/community_resources_apd.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/v/237/departments/police-department/community_resources_apd.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/sharedassets/public/alameda/police/policy-manual.pdf -CC-MAIN-2023-50,*.gov,police,8,http://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/departments/police/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/news/stories/peace-officers-memorial-day-and-national-police-week/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/public-safety/police/police-blotter/ -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/index.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/jobs/ -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/faiiap/police-fire/index.shtml -CC-MAIN-2023-50,*.gov,police,10,https://gov.alaska.gov/a-proclamation-on-honoring-united-states-capitol-police-officers/ -CC-MAIN-2023-50,*.gov,police,10,https://geohub.albanyga.gov/datasets/corrected-police-beat -CC-MAIN-2023-50,*.gov,police,10,https://data.albanyny.gov/browse?tags=police+report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/contact-the-albany-police-department 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/programs/medication-and-sharps-disposal -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/hr/salary-schedules/police-table -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/apba/scholarship_packet.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/a18_alarm_user_permit_application.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/secondhand_dealer.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/Solicitor_License.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/neighborhood-watch/2013_nw_brochure-update.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/property/propertyinventoryrecord-fillable.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/child_safety_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/facebook_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/linkedln_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/photosharingservices_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/smartphone_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/twitter_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/ -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/administration -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/history -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/records-section 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/cold-cases -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/property-inventory-record -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/animal-control -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/apba -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/community-police-academy -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/medication-and-sharps-disposal -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-speed-watch -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/neighborhood-speed-watch -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-watch-program 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safereturn -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/youthacademy -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/qrcode -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/robots.txt -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/child-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/online-social-media-safety-tips -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/protecting-your-business -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safe-exchange-zones -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safety-on-the-road -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/vehicle -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/cadet-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/career-opportunities -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/lateral-officers -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-02-22/alexandria-police-department-makes-arrest-in-connection-to-shots-fired-incident -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-03-15/alexandria-police-department-apprehends-assault-suspect -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-22/alexandria-police-officer-arrested -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-25/alexandria-police-department-investigates-first-homicide-of-the-year -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-04-18/don-hayes-appointed-alexandria-police-chief 
-CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-06-06/alexandria-police-makes-arrest-in-fatal-shooting -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-08-29/alexandria-police-department-investigates-serious-crash -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-12-21/alexandria-police-department-investigates-shooting-incident -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-09-29/apd-lt-graduates-from-dc-police-leadership-academy -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/ -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/community-police-academy -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/criminal-investigation-division -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/listing-page/apd-news-releases -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/office-of-the-police-chief -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/other-services -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/police-services -CC-MAIN-2023-50,*.gov,police,11,http://www3.alexandriava.gov/police/crime_reports/reporter.php -CC-MAIN-2023-50,*.gov,police,11,https://www3.alexandriava.gov/police/crime_reports/reporter.php -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=112991 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/default.aspx?id=24274 -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=59358 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=27648 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=33624 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=68136 -CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-3030.aspx -CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-4122.aspx -CC-MAIN-2023-50,*.gov,police,11,https://aliquippapa.gov/events/light-up-night-at-the-aliquippa-police-station/ -CC-MAIN-2023-50,*.gov,police,11,https://www.almaarkansas.gov/police/ -CC-MAIN-2023-50,*.gov,police,11,https://www.almontmichigan.gov/departments/police-department/ -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/contact-forms/departments/police/report-an-abandoned-vehicle-on-public-streets -CC-MAIN-2023-50,*.gov,police,11,https://www.altoonapa.gov/contacts/police/commander-of-criminal-investigation/lt-ashley-day -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/animal-control -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/directory -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/services -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-documents/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-staff/ 
-CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/how-do-i-file-a-police-report-2/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/who-do-i-call-about-police-related-non-emergencies-2/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/topics/police-courts/ -CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt -CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt -CC-MAIN-2023-50,*.gov,police,11,https://share.america.gov/ar/heres-police-held-accountable-shooting-incidents-video/ diff --git a/common_crawler/main.py b/common_crawler/main.py deleted file mode 100644 index b9dd012f..00000000 --- a/common_crawler/main.py +++ /dev/null @@ -1,350 +0,0 @@ -import argparse -import collections -import dataclasses -import re -import sys -import os -from datetime import datetime - -from dotenv import load_dotenv - -# The below code sets the working directory to be the root of the entire repository -# This is done to solve otherwise quite annoying import issues. -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) - -from util.huggingface_api_manager import HuggingFaceAPIManager -from util.miscellaneous_functions import get_filename_friendly_timestamp -from common_crawler.argparser import parse_args -from common_crawler.cache import CommonCrawlerCacheManager -from common_crawler.crawler import CommonCrawlerManager, CommonCrawlResult -from common_crawler.csv_manager import CSVManager -from label_studio_interface.LabelStudioConfig import LabelStudioConfig -from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager - -""" -This module contains the main function for the Common Crawler script. -""" - - -@dataclasses.dataclass -class BatchInfo: - datetime: str - source: str - count: str - keywords: str - notes: str - filename: str - - -class LabelStudioError(Exception): - """Custom exception for Label Studio Errors""" - - pass - - -BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] - - -def get_current_time(): - return str(datetime.now()) - - -def add_batch_info_to_csv( - common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int -) -> BatchInfo: - batch_info = BatchInfo( - datetime=get_current_time(), - source="Common Crawl", - count=str(len(common_crawl_result.url_results)), - keywords=f"{args.url} - {args.keyword}", - notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", - ) - - batch_info_csv_manager = CSVManager( - file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS - ) - batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) - - return batch_info - - -def main(): - # Parse the arguments - args = parse_args() - - # Initialize the Cache - cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, directory=args.data_dir - ) - - load_dotenv() - - # Initialize the HuggingFace API Manager - hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") - if not hf_access_token: - raise ValueError( - "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " - "Please obtain access token from your personal account at " - "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory." 
- ) - huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, repo_id=args.huggingface_repo_id - ) - ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") - if not ls_access_token: - raise ValueError( - "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " - "Please obtain access token from your personal account at " - "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory." - ) - ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") - if not ls_project_id: - raise ValueError( - "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " - "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory." - ) - - try: - print("Retrieving Label Studio data for deduplication") - label_studio_results = get_ls_data() - if label_studio_results is None: - raise LabelStudioError("Failed to retrieve Label Studio Data") - print("Label Studio data retrieved successfully") - except LabelStudioError as e: - print(e) - raise - - if args.reset_cache: - cache_manager.reset_cache() - - try: - # Retrieve the last page from the cache, or 0 if it does not exist - last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload( - args, last_page, huggingface_api_manager, label_studio_results - ) - except ValueError as e: - print(f"Error during crawling: {e}") - return - - try: - cache_manager.upsert( - index=args.common_crawl_id, - url=args.url, - keyword=args.keyword, - last_page=common_crawl_result.last_page_search, - ) - cache_manager.save_cache() - - except ValueError as e: - print(f"Error while saving cache manager: {e}") - - -def handle_remote_results_error(remote_results): - """ - Handles errors in the remote results - - Args: remote_results (dict): The results from the label studio project - Raises: LabelStudioError: If an error is found in the remote results - """ - - status_code = remote_results.get("status_code") - if status_code == 401: - raise LabelStudioError("Invalid Label Studio token passed! Exiting...") - elif status_code == 404: - raise LabelStudioError("Invalid Label Studio Project ID! Exiting...") - else: - raise LabelStudioError(f"Unexpected error: {remote_results}") - - -def validate_remote_results(remote_results): - """ - Validates the remote results retrieved from the Label Studio project - - Args: remote_results (dict or list): The results from the Label Studio project - - Returns: - list[dict]: If the remote results are valid - None: If the remote results are invalid - """ - if isinstance(remote_results, list): - if not remote_results: - print("No data in Label Studio project.") - return [] - elif "url" not in remote_results[0]["data"]: - raise LabelStudioError( - "Column 'url' not present in Label Studio project. Exiting..." - ) - else: - return remote_results - elif isinstance(remote_results, dict): - handle_remote_results_error(remote_results) - else: - raise LabelStudioError("Unexpected response type.") - - -def get_ls_data() -> list[dict] | None: - """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. - - Returns: - list[dict] | None: Data from the Labels Studio project or None if the result is invalid. 
- """ - # Retrieve the data from the Labels Studio project - config = LabelStudioConfig() - api_manager = LabelStudioAPIManager(config) - response = api_manager.export_tasks_from_project(all_tasks=True) - remote_results = response.json() - - return validate_remote_results(remote_results) - - -def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. - - Args: - url (str): The URL to strip. - - Returns: - str: The stripped URL. - """ - result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) - return result - - -def remove_local_duplicates(url_results: list[str]) -> list[str]: - """Removes duplicate URLs from a list, ignoring http(s)://www. - - Args: - url_results (list[str]): List of URLs to deduplicate. - - Returns: - list[str]: List of unique URLs. - """ - stripped_url_results = [strip_url(url) for url in url_results] - unique_urls = collections.deque() - adjust = 0 - - for index, url in enumerate(stripped_url_results): - if url in unique_urls: - del url_results[index - adjust] - adjust += 1 - else: - unique_urls.appendleft(url) - - return url_results - - -def remove_remote_duplicates( - url_results: list[str], label_studio_data: list[dict] -) -> list[str]: - """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. - - Args: - url_results (list[str]): List of URLs to deduplicate. - label_studio_data (list[dict]): Label Studio project data to check for duplicates. - - Returns: - list[str]: List of remaining URLs not present in the Label Studio project. - """ - try: - remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] - except TypeError: - print( - "Invalid Label Studio credentials. Database could not be checked for duplicates." - ) - return url_results - remote_urls = set(remote_urls) - - stripped_url_results = [strip_url(url) for url in url_results] - adjust = 0 - - for index, url in enumerate(stripped_url_results): - if url in remote_urls: - del url_results[index - adjust] - adjust += 1 - - return url_results - - -def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int, -): - """ - Handles the CSV file and uploads it to Hugging Face repository. - Args: - common_crawl_result: The result from Common Crawl. - huggingface_api_manager: The Hugging Face API manager. - args: The command-line arguments. 
- last_page: last page crawled - - """ - batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) - - csv_manager = CSVManager( - file_name=batch_info.filename, headers=["url"], directory=args.data_dir - ) - csv_manager.add_rows(common_crawl_result.url_results) - huggingface_api_manager.upload_file( - local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", - ) - print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" - ) - csv_manager.delete_file() - - -def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict], -) -> CommonCrawlResult: - # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager(args.common_crawl_id) - # Determine the pages to search, based on the last page searched - start_page = last_page + 1 - # Use the parsed arguments - common_crawl_result: CommonCrawlResult = crawler_manager.crawl( - search_term=args.url, - keyword=args.keyword, - num_pages=args.pages, - start_page=start_page, - ) - # Logic should conclude here if no results are found - if not common_crawl_result.url_results: - print("No url results found. Ceasing main execution.") - add_batch_info_to_csv(common_crawl_result, args, last_page) - return common_crawl_result - - print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates( - common_crawl_result.url_results - ) - common_crawl_result.url_results = remove_remote_duplicates( - common_crawl_result.url_results, label_studio_data - ) - if not common_crawl_result.url_results: - print( - "No urls not already present in the database found. Ceasing main execution." - ) - add_batch_info_to_csv(common_crawl_result, args, last_page) - return common_crawl_result - - handle_csv_and_upload(common_crawl_result, huggingface_api_manager, args, last_page) - - return common_crawl_result - - -if __name__ == "__main__": - # Example usage: python main.py CC-MAIN-2023-50 *.gov "police" - # Usage with optional arguments: python main.py CC-MAIN-2023-50 *.gov "police" -p 2 -o police_urls.txt - print("Running Common Crawler...") - main() diff --git a/common_crawler/requirements_common_crawler_action.txt b/common_crawler/requirements_common_crawler_action.txt deleted file mode 100644 index 22823fd0..00000000 --- a/common_crawler/requirements_common_crawler_action.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests~=2.31.0 -python-dotenv~=1.0.1 -huggingface-hub~=0.22.2 \ No newline at end of file diff --git a/common_crawler/utils.py b/common_crawler/utils.py deleted file mode 100644 index 3cea7af2..00000000 --- a/common_crawler/utils.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -This module contains utility functions for the common_crawler package -""" - - -class URLWithParameters: - """ - A class to handle URLs with parameters, allowing for easy addition of parameters - """ - - def __init__(self, url): - self.url = url - - def add_parameter(self, parameter, value): - if "?" 
in self.url: - self.url += f"&{parameter}={value}" - else: - self.url += f"?{parameter}={value}" - return self.url - - def __str__(self): - return self.url diff --git a/source_collectors/muckrock/convert_all_record_types_to_csv.py b/source_collectors/muckrock/convert_all_record_types_to_csv.py deleted file mode 100644 index 30acdbbe..00000000 --- a/source_collectors/muckrock/convert_all_record_types_to_csv.py +++ /dev/null @@ -1,57 +0,0 @@ -# import subprocess -# import os - -record_types = [ - "accident reports", - "arrest records", - "calls for service", - "car gps", - "citations", - "dispatch logs", - "dispatch recordings", - "field contacts", - "incident reports", - "misc police activity", - "officer involved shootings", - "stops", - "surveys", - "use of force reports", - "vehicle pursuits", - "complaints and misconduct", - "daily activity logs", - "training and hiring info", - "personnel records", - "annual and monthly reports", - "budgets and finances", - "contact info and agency meta", - "geographic", - "list of data sources", - "policies and contracts", - "crime maps and reports", - "crime statistics", - "media bulletins", - "records request info", - "resources", - "sex offender registry", - "wanted persons", - "booking reports", - "court cases", - "incarceration records", -] - -print(len(record_types)) -# json_files = [] - - -# for record_type in record_types: -# json_file = record_type.replace(' ', '_') + '.json' -# json_files.append(json_file) - -# for json_file in json_files: -# command = ['python', 'generate_detailed_muckrock_csv.py', -# '--json_file', json_file] - -# try: -# subprocess.run(command, check=True) -# except subprocess.CalledProcessError as e: -# print(f'An error occurred while processing "{json_file}": {e}') diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index 86ede5d9..2ae99ee6 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -1,3 +1,12 @@ +""" +***DEPRECATED*** + +download_muckrock_foia.py + +This script fetches data from the MuckRock FOIA API and stores the results in a JSON file. 
+ +""" + import requests import csv import time diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index 455084a7..c9f188b3 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -69,8 +69,9 @@ def get_agency(agency_id): def get_jurisdiction(jurisdiction_id): if jurisdiction_id: - jurisdiction_url = f"https://www.muckrock.com/api_v1/jurisdiction/{ - jurisdiction_id}/" + jurisdiction_url = ( + f"https://www.muckrock.com/api_v1/jurisdiction/{jurisdiction_id}/" + ) response = requests.get(jurisdiction_url) if response.status_code == 200: diff --git a/source_collectors/muckrock/get_all_record_types.py b/source_collectors/muckrock/get_all_record_types.py deleted file mode 100644 index 6fa955d2..00000000 --- a/source_collectors/muckrock/get_all_record_types.py +++ /dev/null @@ -1,50 +0,0 @@ -import subprocess - -record_types = [ - "accident reports", - "arrest records", - "calls for service", - "car gps", - "citations", - "dispatch logs", - "dispatch recordings", - "field contacts", - "incident reports", - "misc police activity", - "officer involved shootings", - "stops", - "surveys", - "use of force reports", - "vehicle pursuits", - "complaints and misconduct", - "daily activity logs", - "training and hiring info", - "personnel records", - "annual and monthly reports", - "budgets and finances", - "contact info and agency meta", - "geographic", - "list of data sources", - "policies and contracts", - "crime maps and reports", - "crime statistics", - "media bulletins", - "records request info", - "resources", - "sex offender registry", - "wanted persons", - "booking reports", - "court cases", - "incarceration records", -] - -for record_type in record_types: - command = ["python", "search_foia_data_db.py", "--search_for", record_type] - - try: - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: - print( - f'An error occurred while executing the command for "{ - record_type}": {e}' - ) diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index 02f7a4ec..da2baf74 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -1,3 +1,7 @@ +""" +get_allegheny_foias.py + +""" import requests import json import time diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index a0160a86..20c29338 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -1,3 +1,8 @@ +""" +muck_get.py + +""" + import requests import json diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py index 46b65808..b313c045 100644 --- a/source_collectors/muckrock/muckrock_ml_labeler.py +++ b/source_collectors/muckrock/muckrock_ml_labeler.py @@ -1,3 +1,8 @@ +""" +muckrock_ml_labeler.py + +""" + from transformers import AutoTokenizer, AutoModelForSequenceClassification import torch import pandas as pd diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py index 12290591..e7550608 100644 --- a/source_collectors/muckrock/search_foia_data_db.py +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -178,8 +178,7 @@ def main() -> None: df["communications"] = df["communications"].apply(parse_communications_column) 
print( - f'Found {df.shape[0]} matching entries containing "{ - search_string}" in the title or tags' + f'Found {df.shape[0]} matching entries containing "{search_string}" in the title or tags' ) generate_json(df, search_string) diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py index 66e6aca6..1e5702d9 100644 --- a/source_collectors/muckrock/search_local_foia_json.py +++ b/source_collectors/muckrock/search_local_foia_json.py @@ -1,3 +1,10 @@ +""" +***DEPRECATED*** + +search_local_foia_json.py + +""" + import json # Specify the JSON file path @@ -41,4 +48,4 @@ def search_entry(entry): with open("matching_entries.json", "w", encoding="utf-8") as file: json.dump(matching_entries, file, indent=4) -print(f"Matching entries written to 'matching_entries.json'") +print("Matching entries written to 'matching_entries.json'") From 9904e9e93e94c44e1d247165b4ab9f2b5e2401c9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 09:29:44 -0500 Subject: [PATCH 05/11] Add todo to main --- .github/workflows/common_crawler.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/common_crawler.yaml b/.github/workflows/common_crawler.yaml index 734bb94d..52b4007d 100644 --- a/.github/workflows/common_crawler.yaml +++ b/.github/workflows/common_crawler.yaml @@ -23,17 +23,17 @@ jobs: - name: Upgrade pip run: python -m pip install --upgrade pip - name: Install dependencies - run: pip install -r common_crawler/requirements_common_crawler_action.txt + run: pip install -r source_collectors/common_crawler/requirements_common_crawler_action.txt - name: Run script - run: python common_crawler/main.py CC-MAIN-2024-10 *.gov police --config common_crawler/config.ini --pages 20 + run: python source_collectors/common_crawler/main.py CC-MAIN-2024-10 *.gov police --config source_collectors/common_crawler/config.ini --pages 20 - name: Configure Git run: | git config --local user.email "action@github.com" git config --local user.name "GitHub Action" - name: Add common_crawler cache and common_crawler batch_info run: | - git add common_crawler/data/cache.json - git add common_crawler/data/batch_info.csv + git add source_collectors/common_crawler/data/cache.json + git add source_collectors/common_crawler/data/batch_info.csv - name: Commit changes run: git commit -m "Update common_crawler cache and batch_info" - name: Push changes From c3e6864cedfa20e9c5262ea1bd06e94f72241aff Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 09:29:44 -0500 Subject: [PATCH 06/11] Adjust common crawler paths in Github Actions --- .github/workflows/common_crawler.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/common_crawler.yaml b/.github/workflows/common_crawler.yaml index 734bb94d..52b4007d 100644 --- a/.github/workflows/common_crawler.yaml +++ b/.github/workflows/common_crawler.yaml @@ -23,17 +23,17 @@ jobs: - name: Upgrade pip run: python -m pip install --upgrade pip - name: Install dependencies - run: pip install -r common_crawler/requirements_common_crawler_action.txt + run: pip install -r source_collectors/common_crawler/requirements_common_crawler_action.txt - name: Run script - run: python common_crawler/main.py CC-MAIN-2024-10 *.gov police --config common_crawler/config.ini --pages 20 + run: python source_collectors/common_crawler/main.py CC-MAIN-2024-10 *.gov police --config source_collectors/common_crawler/config.ini --pages 20 - name: Configure Git run: | 
git config --local user.email "action@github.com" git config --local user.name "GitHub Action" - name: Add common_crawler cache and common_crawler batch_info run: | - git add common_crawler/data/cache.json - git add common_crawler/data/batch_info.csv + git add source_collectors/common_crawler/data/cache.json + git add source_collectors/common_crawler/data/batch_info.csv - name: Commit changes run: git commit -m "Update common_crawler cache and batch_info" - name: Push changes From 46afa8fa07c1c04ffa2f8097eaf7f0311166dcff Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 14:37:03 -0500 Subject: [PATCH 07/11] Update Python checks --- .github/workflows/python_checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_checks.yml b/.github/workflows/python_checks.yml index 4efaf15d..6a5c642d 100644 --- a/.github/workflows/python_checks.yml +++ b/.github/workflows/python_checks.yml @@ -18,5 +18,5 @@ jobs: uses: reviewdog/action-flake8@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} - flake8_args: --ignore E501,W291,W293,D401,D400,E402,E302,D200,D202,D205 + flake8_args: --ignore E501,W291,W293,D401,D400,E402,E302,D200,D202,D205,W503,E203 level: warning From c56828181b1c86183fbb2b101f2d2bf708e41861 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 14:37:12 -0500 Subject: [PATCH 08/11] Add docstrings --- source_collectors/ckan/ckan_scraper_toolkit.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source_collectors/ckan/ckan_scraper_toolkit.py b/source_collectors/ckan/ckan_scraper_toolkit.py index 5898c9f0..b441c039 100644 --- a/source_collectors/ckan/ckan_scraper_toolkit.py +++ b/source_collectors/ckan/ckan_scraper_toolkit.py @@ -17,6 +17,9 @@ @dataclass class Package: + """ + A class representing a CKAN package (dataset). + """ base_url: str = "" url: str = "" title: str = "" @@ -28,6 +31,9 @@ class Package: source_last_updated: str = "" def to_dict(self): + """ + Returns a dictionary representation of the package. 
+ """ return { "source_url": self.url, "submitted_name": self.title, From 0b0e91dbdab63eadfaac3201b7d7f077e2d0871f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 14:37:20 -0500 Subject: [PATCH 09/11] Remove unused import --- source_collectors/ckan/scrape_ckan_data_portals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index 57bd9927..ae54dd59 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -1,7 +1,6 @@ """Retrieves packages from CKAN data portals and parses relevant information then outputs to a CSV file""" from itertools import chain -import json import sys from typing import Any, Callable, Optional From 8eea3b413e89548364cdd3e19aee5211209bda05 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 15:02:10 -0500 Subject: [PATCH 10/11] Add docstrings --- .../ckan/scrape_ckan_data_portals.py | 3 +++ source_collectors/common_crawler/crawler.py | 22 +++++++++++++++++++ .../common_crawler/csv_manager.py | 6 +++++ source_collectors/common_crawler/main.py | 15 +++++++++++++ source_collectors/common_crawler/utils.py | 9 ++++++++ .../muckrock/download_muckrock_foia.py | 4 +++- .../generate_detailed_muckrock_csv.py | 7 +++++- .../muckrock/get_allegheny_foias.py | 12 +++++++--- .../muckrock/search_local_foia_json.py | 4 +++- 9 files changed, 76 insertions(+), 6 deletions(-) diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index ae54dd59..2bb7733b 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -247,6 +247,9 @@ def get_supplying_entity(result: dict[str, Any]) -> str: def main(): + """ + Main function. + """ results = [] print("Gathering results...") diff --git a/source_collectors/common_crawler/crawler.py b/source_collectors/common_crawler/crawler.py index 0982ca53..b77e670f 100644 --- a/source_collectors/common_crawler/crawler.py +++ b/source_collectors/common_crawler/crawler.py @@ -18,6 +18,12 @@ @dataclass class CommonCrawlResult: + """ + A class to hold the results of a Common Crawl search. + Args: + last_page_search: the last page searched + url_results: the list of URLs found in the search + """ last_page_search: int url_results: list[str] @@ -31,12 +37,25 @@ class CommonCrawlerManager: """ def __init__(self, crawl_id="CC-MAIN-2023-50"): + """ + Initializes the CommonCrawlerManager with a crawl ID. + Args: + crawl_id: the Common Crawl index to use + """ self.crawl_id = crawl_id CC_INDEX_SERVER = "http://index.commoncrawl.org/" INDEX_NAME = f"{self.crawl_id}-index" self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: + """ + Crawls the Common Crawl index for a given search term and keyword. 
+ Args: + search_term: the term to search for + keyword: the keyword to search for + start_page: the page to start the search from + num_pages: the number of pages to search + """ print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," f" starting at page {start_page}" @@ -136,4 +155,7 @@ def process_response( @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: + """ + Returns a list of URLs that contain the given keyword + """ return [record["url"] for record in records if keyword in record["url"]] diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py index 2b823b42..5a80aeaa 100644 --- a/source_collectors/common_crawler/csv_manager.py +++ b/source_collectors/common_crawler/csv_manager.py @@ -11,6 +11,12 @@ class CSVManager: """ def __init__(self, file_name: str, headers: list[str], directory=None): + """ + Args: + file_name: the name of the CSV file + headers: the headers for the CSV file + directory: the directory to store the CSV file + """ self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py index b9dd012f..a83b0aee 100644 --- a/source_collectors/common_crawler/main.py +++ b/source_collectors/common_crawler/main.py @@ -28,6 +28,9 @@ @dataclasses.dataclass class BatchInfo: + """ + Dataclass for batch info + """ datetime: str source: str count: str @@ -46,12 +49,18 @@ class LabelStudioError(Exception): def get_current_time(): + """ + Returns the current time + """ return str(datetime.now()) def add_batch_info_to_csv( common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int ) -> BatchInfo: + """ + Adds batch info to CSV + """ batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", @@ -70,6 +79,9 @@ def add_batch_info_to_csv( def main(): + """ + Main function + """ # Parse the arguments args = parse_args() @@ -307,6 +319,9 @@ def process_crawl_and_upload( huggingface_api_manager: HuggingFaceAPIManager, label_studio_data: list[dict], ) -> CommonCrawlResult: + """ + Processes a crawl and uploads the results to Hugging Face. + """ # Initialize the CommonCrawlerManager crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched diff --git a/source_collectors/common_crawler/utils.py b/source_collectors/common_crawler/utils.py index 3cea7af2..8023d50d 100644 --- a/source_collectors/common_crawler/utils.py +++ b/source_collectors/common_crawler/utils.py @@ -9,9 +9,15 @@ class URLWithParameters: """ def __init__(self, url): + """ + Initialize the URLWithParameters object with the given URL + """ self.url = url def add_parameter(self, parameter, value): + """ + Add a parameter to the URL + """ if "?" 
in self.url: self.url += f"&{parameter}={value}" else: @@ -19,4 +25,7 @@ def add_parameter(self, parameter, value): return self.url def __str__(self): + """ + Return the URL + """ return self.url diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index 2ae99ee6..0abd527d 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -22,8 +22,10 @@ output_file = "foia_data.json" -# Function to fetch data from a specific page def fetch_page(page): + """ + Fetches data from a specific page of the MuckRock FOIA API. + """ response = requests.get( base_url, params={"page": page, "page_size": per_page, "format": "json"} ) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index c9f188b3..a077dbc7 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -53,7 +53,9 @@ def get_agency(agency_id): - # API call to get agency_described + """ + Function to get agency_described + """ if agency_id: agency_url = f"https://www.muckrock.com/api_v1/agency/{agency_id}/" response = requests.get(agency_url) @@ -68,6 +70,9 @@ def get_agency(agency_id): def get_jurisdiction(jurisdiction_id): + """ + Function to get jurisdiction_described + """ if jurisdiction_id: jurisdiction_url = ( f"https://www.muckrock.com/api_v1/jurisdiction/{jurisdiction_id}/" diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index da2baf74..a559f67f 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -7,8 +7,10 @@ import time -# Function to fetch jurisdiction IDs based on town names from a text file def fetch_jurisdiction_ids(town_file, base_url): + """ + fetch jurisdiction IDs based on town names from a text file + """ with open(town_file, "r") as file: town_names = [line.strip() for line in file] @@ -39,8 +41,10 @@ def fetch_jurisdiction_ids(town_file, base_url): return jurisdiction_ids -# Function to fetch FOIA data for each jurisdiction ID and save it to a JSON file def fetch_foia_data(jurisdiction_ids): + """ + fetch FOIA data for each jurisdiction ID and save it to a JSON file + """ all_data = [] for name, id_ in jurisdiction_ids.items(): url = f"https://www.muckrock.com/api_v1/foia/?status=done&jurisdiction={id_}" @@ -68,8 +72,10 @@ def fetch_foia_data(jurisdiction_ids): print(f"Saved {len(all_data)} records to foia_data_combined.json") -# Main function to execute the script def main(): + """ + Execute the script + """ town_file = "allegheny-county-towns.txt" jurisdiction_url = ( "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126" diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py index 1e5702d9..562c4bae 100644 --- a/source_collectors/muckrock/search_local_foia_json.py +++ b/source_collectors/muckrock/search_local_foia_json.py @@ -19,8 +19,10 @@ matching_entries = [] -# Function to search within an entry def search_entry(entry): + """ + search within an entry + """ # Check if 'status' is 'done' if entry.get("status") != "done": return False From e7643d6663037f6ea2066bd5ef4a195493b678f4 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 12 Dec 2024 15:04:09 -0500 Subject: [PATCH 11/11] Add docstrings, add 
additional ignores to Flake8

---
 .github/workflows/python_checks.yml    |  2 +-
 source_collectors/ckan/search_terms.py |  4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python_checks.yml b/.github/workflows/python_checks.yml
index 6a5c642d..7f5bef91 100644
--- a/.github/workflows/python_checks.yml
+++ b/.github/workflows/python_checks.yml
@@ -18,5 +18,5 @@ jobs:
         uses: reviewdog/action-flake8@v3
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          flake8_args: --ignore E501,W291,W293,D401,D400,E402,E302,D200,D202,D205,W503,E203
+          flake8_args: --ignore E501,W291,W293,D401,D400,E402,E302,D200,D202,D205,W503,E203,D204,D403
           level: warning

diff --git a/source_collectors/ckan/search_terms.py b/source_collectors/ckan/search_terms.py
index 179e58d8..716d68f7 100644
--- a/source_collectors/ckan/search_terms.py
+++ b/source_collectors/ckan/search_terms.py
@@ -1,3 +1,7 @@
+"""
+CKAN search terms
+"""
+
 package_search = [
     {
         "url": "https://catalog.data.gov/",