From 3519bf42f8948a7e15d52d2d08fac5dc59f9a703 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Thu, 17 Apr 2025 16:36:40 -0400
Subject: [PATCH] refactor(app): remove deprecated and unused code

---
 .github/workflows/common_crawler.yaml | 40 --
 .github/workflows/populate_labelstudio.yml | 94 -----
 ENV.md | 3 -
 agency_identifier/README.md | 40 --
 agency_identifier/__init__.py | 0
 agency_identifier/identifier.py | 234 -----------
 api/routes/batch.py | 1 -
 collector_db/enums.py | 1 -
 collector_manager/AsyncCollectorBase.py | 11 +-
 core/SourceCollectorCore.py | 2 -
 core/TaskManager.py | 2 +-
 .../MuckrockAgencyIdentificationSubtask.py | 2 +-
 .../AgencyIdentificationTaskOperator.py | 2 +-
 source_collectors/ckan/README.md | 22 --
 source_collectors/ckan/main.py | 44 ---
 source_collectors/ckan/requirements.txt | 6 -
 source_collectors/ckan/schemas.py | 6 -
 .../ckan/scrape_ckan_data_portals.py | 25 --
 source_collectors/common_crawler/README.md | 87 -----
 source_collectors/common_crawler/argparser.py | 95 -----
 source_collectors/common_crawler/cache.py | 93 -----
 source_collectors/common_crawler/config.ini | 19 -
 .../common_crawler/csv_manager.py | 79 ----
 .../common_crawler/data/cache.json | 7 -
 .../common_crawler/data/urls.csv | 207 ----------
 source_collectors/common_crawler/main.py | 366 ------------------
 .../requirements_common_crawler_action.txt | 3 -
 source_collectors/common_crawler/schemas.py | 22 --
 .../muckrock}/MuckrockAPIInterface.py | 0
 source_collectors/muckrock/README.md | 82 ----
 .../muckrock/classes/SQLiteClient.py | 38 --
 source_collectors/muckrock/muck_get.py | 16 -
 source_collectors/muckrock/requirements.txt | 30 --
 .../test_muckrock_api_interface.py | 2 +-
 tests/manual/unsorted/test_identifier_unit.py | 275 -------------
 .../integration/api/conftest.py | 2 -
 .../tasks/test_agency_preannotation_task.py | 2 +-
 37 files changed, 10 insertions(+), 1950 deletions(-)
 delete mode 100644 .github/workflows/common_crawler.yaml
 delete mode 100644 .github/workflows/populate_labelstudio.yml
 delete mode 100644 agency_identifier/README.md
 delete mode 100644 agency_identifier/__init__.py
 delete mode 100644 agency_identifier/identifier.py
 delete mode 100644 source_collectors/ckan/main.py
 delete mode 100644 source_collectors/ckan/requirements.txt
 delete mode 100644 source_collectors/ckan/schemas.py
 delete mode 100644 source_collectors/common_crawler/README.md
 delete mode 100644 source_collectors/common_crawler/argparser.py
 delete mode 100644 source_collectors/common_crawler/cache.py
 delete mode 100644 source_collectors/common_crawler/config.ini
 delete mode 100644 source_collectors/common_crawler/csv_manager.py
 delete mode 100644 source_collectors/common_crawler/data/cache.json
 delete mode 100644 source_collectors/common_crawler/data/urls.csv
 delete mode 100644 source_collectors/common_crawler/main.py
 delete mode 100644 source_collectors/common_crawler/requirements_common_crawler_action.txt
 delete mode 100644 source_collectors/common_crawler/schemas.py
 rename {agency_identifier => source_collectors/muckrock}/MuckrockAPIInterface.py (100%)
 delete mode 100644 source_collectors/muckrock/classes/SQLiteClient.py
 delete mode 100644 source_collectors/muckrock/muck_get.py
 delete mode 100644 source_collectors/muckrock/requirements.txt
 delete mode 100644 tests/manual/unsorted/test_identifier_unit.py

diff --git a/.github/workflows/common_crawler.yaml b/.github/workflows/common_crawler.yaml
deleted file mode 100644
index 52b4007d..00000000
--- a/.github/workflows/common_crawler.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: Common Crawler - -# Pull request will run every day at 1AM. -on: - workflow_dispatch: -env: - # The access token enabling write access to the Huggingface Database - HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} - -jobs: - build-and-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - # This is necessary to push commits back to the repository - persist-credentials: true - fetch-depth: 0 # Fetch all history for all tags and branches - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.11.8 - - name: Upgrade pip - run: python -m pip install --upgrade pip - - name: Install dependencies - run: pip install -r source_collectors/common_crawler/requirements_common_crawler_action.txt - - name: Run script - run: python source_collectors/common_crawler/main.py CC-MAIN-2024-10 *.gov police --config source_collectors/common_crawler/config.ini --pages 20 - - name: Configure Git - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - - name: Add common_crawler cache and common_crawler batch_info - run: | - git add source_collectors/common_crawler/data/cache.json - git add source_collectors/common_crawler/data/batch_info.csv - - name: Commit changes - run: git commit -m "Update common_crawler cache and batch_info" - - name: Push changes - run: git push \ No newline at end of file diff --git a/.github/workflows/populate_labelstudio.yml b/.github/workflows/populate_labelstudio.yml deleted file mode 100644 index 09ca68b2..00000000 --- a/.github/workflows/populate_labelstudio.yml +++ /dev/null @@ -1,94 +0,0 @@ -name: Populate LabelStudio - -on: - workflow_dispatch: - inputs: - crawl_id: - description: 'Common Crawl Corpus' - required: true - default: 'CC-MAIN-2024-10' - url: - description: 'URL type' - required: true - default: '*.gov' - keyword: - description: 'keyword' - required: true - default: 'police' - pages: - description: 'num pages' - required: true - default: '2' - record_type: - description: 'record type' - required: false - - -jobs: - run-script: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - ref: main - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r annotation_pipeline/requirements.txt - - - name: Run main script - env: - HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} - LABEL_STUDIO_ACCESS_TOKEN: ${{ secrets.LABEL_STUDIO_ACCESS_TOKEN }} - LABEL_STUDIO_PROJECT_ID: ${{ secrets.LABEL_STUDIO_PROJECT_ID }} - LABEL_STUDIO_ORGANIZATION: ${{ secrets.LABEL_STUDIO_ORGANIZATION }} - run: | - if [ -n "${{ github.event.inputs.record_type }}" ]; then - python annotation_pipeline/populate_labelstudio.py ${{ github.event.inputs.crawl_id }} "${{ github.event.inputs.url }}" ${{ github.event.inputs.keyword }} --pages ${{ github.event.inputs.pages }} --record_type "${{ github.event.inputs.record_type }}" - else - python annotation_pipeline/populate_labelstudio.py ${{ github.event.inputs.crawl_id }} "${{ github.event.inputs.url }}" ${{ github.event.inputs.keyword }} --pages ${{ github.event.inputs.pages }} - fi - - - name: Check created/modified files - run: | - echo "Checking files in annotation_pipeline/data/" - ls -R annotation_pipeline/data/ - - - name: Create new branch - run: | - BRANCH_NAME=bot-update-$(date +%Y%m%d%H%M%S) - git checkout -b $BRANCH_NAME - echo 
"BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV - - - name: Commit and push outputs - run: | - git config --global user.name "github-actions[bot]" - git config --global user.email "action@github.com" - git add annotation_pipeline/data/batch_info.csv - git add annotation_pipeline/data/cache.json - if [ -d "annotation_pipeline/data/tag_collector" ]; then - git add annotation_pipeline/data/tag_collector/* - fi - git commit -m "Update batch info, cache, and collected urls & tags" - git log -1 --stat - git push --set-upstream origin $BRANCH_NAME - - - name: Create pull request - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH_NAME: ${{ env.BRANCH_NAME }} - run: | - PR_TITLE="Update batch info, cache, and collected urls & tags" - PR_BODY="This PR was created automatically by a GitHub Action." - echo "Creating PR from branch $BRANCH_NAME to main" - curl -X POST -H "Authorization: token $GITHUB_TOKEN" \ - -d "{\"title\":\"$PR_TITLE\",\"body\":\"$PR_BODY\",\"head\":\"$BRANCH_NAME\",\"base\":\"main\"}" \ - https://api.github.com/repos/${{ github.repository }}/pulls diff --git a/ENV.md b/ENV.md index 7c09fb64..5292320b 100644 --- a/ENV.md +++ b/ENV.md @@ -4,9 +4,6 @@ Please ensure these are properly defined in a `.env` file in the root directory. | Name | Description | Example | |----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------| -| `LABEL_STUDIO_ACCESS_TOKEN` | The access token for the Label Studio API. The access token for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. | `abc123` | -| `LABEL_STUDIO_PROJECT_ID` | The project ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL, as in `https://app.heartex.com/projects/58475/` | `58475` | -| `LABEL_STUDIO_ORGANIZATION_ID` | The organization ID for the Label Studio API. This can be obtained by logging into Label Studio and navigating to the [Organization section](https://app.heartex.com/organization?page=1), where the organization ID can be copied. | `6758` | | `GOOGLE_API_KEY` | The API key required for accessing the Google Custom Search API | `abc123` | | `GOOGLE_CSE_ID` | The CSE ID required for accessing the Google Custom Search API | `abc123` | |`POSTGRES_USER` | The username for the test database | `test_source_collector_user` | diff --git a/agency_identifier/README.md b/agency_identifier/README.md deleted file mode 100644 index c1dadcf2..00000000 --- a/agency_identifier/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Agency Identifier - -The Agency Identifier is a Python application that matches URLs with an agency from the PDAP database. It takes a list of URLs as input, either from a CSV file or a DataFrame, and returns a DataFrame with the matched agencies. - -## How to use - -### Running from the command line - -1. Clone the repository. -2. Create a CSV file containing a list of URLs to be identified. The URLs should be listed one per line, and the file should have at least a "url" column. -3. Run the command `python3 identifier.py [url_file]`, replacing `[url_file]` with the path to your CSV file. -4. 
The results will be written to a file named `results.csv` in the same directory. - -### Using the "identifier_main" function - -If you're using the Agency Identifier in your own Python code, you can import the `process_and_write_data` function. This function takes a DataFrame as an argument and returns a DataFrame with the matched agencies. - -Here's an example of how to use it: - -```python -import polar as pl -from identifier import process_and_write_data - -# Create a DataFrame with the URLs to be identified -df = pl.DataFrame({"url": ["http://agency1.com/page1", "http://agency2.com/page2"]}) - -# Call the identifier_main function -result = process_and_write_data(df) - -# Print the resulting DataFrame -print(result) -``` - -# Requirements - -- Python 3 -- urllib -- re -- polars -- requests \ No newline at end of file diff --git a/agency_identifier/__init__.py b/agency_identifier/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/agency_identifier/identifier.py b/agency_identifier/identifier.py deleted file mode 100644 index 786aeba6..00000000 --- a/agency_identifier/identifier.py +++ /dev/null @@ -1,234 +0,0 @@ -import os -import re -import sys -from urllib.parse import urlparse - -import polars -import requests - -API_URL = "https://data-sources.pdap.io/api/agencies/" - - -def get_page_data(page: int) -> dict: - """Fetches a page of data from the API. - - Args: - page (int): The page number to fetch. - - Returns: - dict: The data for the page. - """ - api_key = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY") - response = requests.get(f"{API_URL}{page}", headers={"Authorization": api_key}) - if response.status_code != 200: - raise Exception("Request to PDAP API failed. Response code:", response.status_code) - return response.json()["data"] - - -def get_agencies_data() -> polars.DataFrame: - """Retrives a list of agency dictionaries from file. - - Returns: - list: List of agency dictionaries. - """ - page = 1 - agencies_df = polars.DataFrame() - results = get_page_data(page) - - while results: - # Use list comprehension to clean results - clean_results = clean_page_data_results(results) - new_agencies_df = polars.DataFrame(clean_results) - if not new_agencies_df.is_empty(): - agencies_df = polars.concat([agencies_df, new_agencies_df]) - page += 1 - results = get_page_data(page) - - return agencies_df - - -def clean_page_data_results(results: list[dict[str, str]]) -> list[dict[str, str]]: - clean_results = [] - for result in results: - clean_result = {} - for k, v in result.items(): - if v is None: - clean_result[k] = "" - else: - clean_result[k] = v - clean_results.append(clean_result) - return clean_results - - -def parse_hostname(url: str) -> str: - """Retrieves the hostname (example.com) from a url string. - - Args: - url (str): Url to parse. - - Returns: - str: The url's hostname. - """ - try: - # Remove leading and trailing whitespaces and quotes - url = url.strip().strip('"') - - # Add "http://" to the url if it's not present - if not re.match(r'http(s)?://', url): - url = "http://" + url - - # Parse the url and retrieve the hostname - parsed_url = urlparse(url) - hostname = parsed_url.hostname - - # Remove "www." from the hostname - hostname = re.sub(r'^www\.', '', hostname) - except Exception as e: - print(f"An error occurred while parsing the URL: {e}") - raise e - return hostname - - -def remove_http(url: str) -> str: - """Removes http(s)://www. from a given url so that different protocols don't throw off the matcher. 
- - Args: - url (str): Url to remove http from. - - Returns: - str: The url without http(s)://www. - """ - try: - # Remove http(s)://www. and www. prefixes from the url - url = re.sub(r'^(http(s)?://)?(www\.)?', '', url) - # Ensure the url ends with a / - if not url.endswith('/'): - url += '/' - except Exception as e: - print(f"An error occurred while processing the URL: {e}") - raise e - return url - - -def match_agencies(agencies, agency_hostnames, url): - """Attempts to match a url with an agency. - - Args: - agencies (list): List of agency dictionaries. - agency_hostnames (list): List of corresponding agency hostnames. - url (str): Url to match. - - Returns: - dict: Dictionary of a match in the form {"url": url, "agency": matched_agency}. - """ - url = url.strip().strip('"') - url_hostname = parse_hostname(url) - - if url_hostname in agency_hostnames: - # All agencies with the same hostname as the url are found - matched_agency = [ - agencies[i] for i, agency_hostname in enumerate(agency_hostnames) if url_hostname == agency_hostname - ] - else: - return {"url": url, "agency": [], "status": "No match found"} - - # More than one agency was found - if len(matched_agency) > 1: - url_no_http = remove_http(url) - - for agency in matched_agency: - agency_homepage = remove_http(agency["homepage_url"]) - # It is assumed that if the url begins with the agency's url, then it belongs to that agency - if url_no_http.startswith(agency_homepage): - return {"url": url, "agency": agency, "status": "Match found"} - break - - return {"url": url, "agency": [], "status": "Contested match"} - - return {"url": url, "agency": matched_agency[0], "status": "Match found"} - - -def match_urls_to_agencies_and_clean_data(urls_df: polars.DataFrame) -> polars.DataFrame: - agencies_df = get_agencies_data() - # Filter out agencies without a homepage_url set - # Define column names as variables for flexibility - homepage_url_col = "homepage_url" - hostname_col = "hostname" - count_data_sources_col = "count_data_sources" - max_data_sources_col = "max_data_sources" - - # Perform operations on DataFrame - try: - agencies_df = ( - agencies_df - # Filter out rows without a homepage_url - .filter(polars.col(homepage_url_col).is_not_null()) - .filter(polars.col(homepage_url_col) != "") - # Add a new column 'hostname' by applying the parse_hostname function to 'homepage_url' - .with_columns(polars.col(homepage_url_col).map_elements(parse_hostname).alias(hostname_col), - polars.col(count_data_sources_col).fill_null(0)) - # Add a new column 'max_data_sources' which is the max of 'count_data_sources' over 'hostname' - .with_columns(polars.col(count_data_sources_col).max().over(hostname_col).alias(max_data_sources_col)) - # Filter rows where 'count_data_sources' equals 'max_data_sources' - .filter(polars.col(count_data_sources_col) == polars.col(max_data_sources_col)) - # Keep only unique rows based on 'homepage_url' - .unique(subset=[homepage_url_col]) - ) - print("Indentifying agencies...") - # Add a new column 'hostname' by applying the parse_hostname function to 'url' - urls_df = urls_df.with_columns(polars.col("url").map_elements(parse_hostname).alias("hostname")) - - # Join urls_df with agencies_df on 'hostname' - matched_agencies_df = urls_df.join(agencies_df, on="hostname", how="left") - - # Replace all null values with an empty string - matched_agencies_clean_df = matched_agencies_df.with_columns(polars.all().fill_null("")) - except Exception as e: - print(f"An error occurred while processing the data: {e}") - raise e - 
return matched_agencies_clean_df - - -def read_data(file_path: str) -> polars.DataFrame: - try: - return polars.read_csv(file_path) - except Exception as e: - print(f"An error occurred while reading the file: {e}") - raise e - - -def write_data(df: polars.DataFrame, file_path: str): - try: - df.write_csv(file_path) - print("Results written to results.csv") - except Exception as e: - print(f"An error occurred while writing to the file: {e}") - raise e - - -def process_data(urls_df: polars.DataFrame) -> polars.DataFrame: - matched_agencies_df = match_urls_to_agencies_and_clean_data(urls_df) - - # Filter out rows where the hostname is not null - matches_only = matched_agencies_df.filter(polars.col("hostname").is_not_null()) - num_matches = len(matches_only) - num_urls = len(urls_df) - percent_urls_matched = 100 * float(num_matches) / float(num_urls) - - # Print the number and percentage of URLs that were matched - print(f"\n{num_matches} / {num_urls} ({percent_urls_matched:0.1f}%) of urls identified") - - # Return the DataFrame containing only the matched URLs - return matches_only - - -def process_and_write_data(input_file: str, output_file: str): - urls_df = read_data(input_file) - matches_only = process_data(urls_df) - if not matches_only.is_empty(): - write_data(matches_only, output_file) - - -if __name__ == "__main__": - process_and_write_data(sys.argv[1], "results.csv") - print("Results written to results.csv") diff --git a/api/routes/batch.py b/api/routes/batch.py index 2c791503..7ba0a2a4 100644 --- a/api/routes/batch.py +++ b/api/routes/batch.py @@ -103,7 +103,6 @@ async def get_batch_logs( @batch_router.post("/{batch_id}/abort") async def abort_batch( batch_id: int = Path(description="The batch id"), - core: SourceCollectorCore = Depends(get_core), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), ) -> MessageResponse: diff --git a/collector_db/enums.py b/collector_db/enums.py index a701a847..b28b6091 100644 --- a/collector_db/enums.py +++ b/collector_db/enums.py @@ -17,7 +17,6 @@ class ValidationStatus(PyEnum): class ValidationSource(PyEnum): MACHINE_LEARNING = "Machine Learning" - LABEL_STUDIO = "Label Studio" MANUAL = "Manual" diff --git a/collector_manager/AsyncCollectorBase.py b/collector_manager/AsyncCollectorBase.py index 099f5338..a842a9c0 100644 --- a/collector_manager/AsyncCollectorBase.py +++ b/collector_manager/AsyncCollectorBase.py @@ -72,17 +72,17 @@ async def handle_error(self, e: Exception) -> None: ) async def process(self) -> None: - await self.log("Processing collector...", allow_abort=False) + await self.log("Processing collector...") preprocessor = self.preprocessor() url_infos = preprocessor.preprocess(self.data) - await self.log(f"URLs processed: {len(url_infos)}", allow_abort=False) + await self.log(f"URLs processed: {len(url_infos)}") - await self.log("Inserting URLs...", allow_abort=False) + await self.log("Inserting URLs...") insert_urls_info: InsertURLsInfo = await self.adb_client.insert_urls( url_infos=url_infos, batch_id=self.batch_id ) - await self.log("Updating batch...", allow_abort=False) + await self.log("Updating batch...") await self.adb_client.update_batch_post_collection( batch_id=self.batch_id, total_url_count=insert_urls_info.total_count, @@ -91,7 +91,7 @@ async def process(self) -> None: batch_status=self.status, compute_time=self.compute_time ) - await self.log("Done processing collector.", allow_abort=False) + await self.log("Done processing collector.") if 
self.post_collection_function_trigger is not None: await self.post_collection_function_trigger.trigger_or_rerun() @@ -123,7 +123,6 @@ async def run(self) -> None: async def log( self, message: str, - allow_abort = True # Deprecated ) -> None: await self.logger.log(LogInfo( batch_id=self.batch_id, diff --git a/core/SourceCollectorCore.py b/core/SourceCollectorCore.py index 4516ceb5..6f05a3c4 100644 --- a/core/SourceCollectorCore.py +++ b/core/SourceCollectorCore.py @@ -9,8 +9,6 @@ class SourceCollectorCore: def __init__( self, - core_logger: Optional[Any] = None, # Deprecated - collector_manager: Optional[Any] = None, # Deprecated db_client: Optional[DatabaseClient] = None, dev_mode: bool = False ): diff --git a/core/TaskManager.py b/core/TaskManager.py index 429375c2..e72724fc 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -1,6 +1,6 @@ import logging -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.enums import TaskType diff --git a/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py b/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py index 03f2a064..a6222cf8 100644 --- a/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py +++ b/core/classes/subtasks/MuckrockAgencyIdentificationSubtask.py @@ -1,6 +1,6 @@ from typing import Optional -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponse, AgencyLookupResponseType +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponse, AgencyLookupResponseType from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.exceptions import MuckrockAPIError from core.helpers import process_match_agency_response_to_suggestions diff --git a/core/classes/task_operators/AgencyIdentificationTaskOperator.py b/core/classes/task_operators/AgencyIdentificationTaskOperator.py index 4c2d6f1b..b6e53955 100644 --- a/core/classes/task_operators/AgencyIdentificationTaskOperator.py +++ b/core/classes/task_operators/AgencyIdentificationTaskOperator.py @@ -1,6 +1,6 @@ from aiohttp import ClientSession -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo from collector_db.enums import TaskType diff --git a/source_collectors/ckan/README.md b/source_collectors/ckan/README.md index be6c65cf..2afcbb28 100644 --- a/source_collectors/ckan/README.md +++ b/source_collectors/ckan/README.md @@ -19,28 +19,6 @@ Running the scraper will output a list of packages to a CSV file using the searc * `search_terms.py` - The search terms and CKAN portals to search from. * `ckan_scraper_toolkit.py` - Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals. -## Setup - -1. In a terminal, navigate to the CKAN scraper folder - ```cmd - cd scrapers_library/data_portals/ckan/ - ``` -2. Create and activate a Python virtual environment - ```cmd - python -m venv venv - source venv/bin/activate - ``` - -3. Install the requirements - ```cmd - pip install -r requirements.txt - ``` -4. Run the multi-portal CKAN scraper - ```cmd - python scrape_ckan_data_portals.py - ``` -5. 
Review the generated `results.csv` file. - ## How can I tell if a website I want to scrape is hosted using CKAN? There's no easy way to tell, some websites will reference CKAN or link back to the CKAN documentation while others will not. There doesn't seem to be a database of all CKAN instances either. diff --git a/source_collectors/ckan/main.py b/source_collectors/ckan/main.py deleted file mode 100644 index 091d2642..00000000 --- a/source_collectors/ckan/main.py +++ /dev/null @@ -1,44 +0,0 @@ -from source_collectors.ckan.ckan_scraper_toolkit import ckan_package_search, ckan_group_package_show, \ - ckan_package_search_from_organization -from source_collectors.ckan.scrape_ckan_data_portals import perform_search, get_flat_list, deduplicate_entries, \ - get_collection_child_packages, filter_result, parse_result, write_to_csv -from source_collectors.ckan.search_terms import package_search, group_search, organization_search - - - -async def main(): - """ - Main function. - """ - results = [] - - print("Gathering results...") - results = await perform_search( - search_func=ckan_package_search, - search_terms=package_search, - results=results, - ) - results = await perform_search( - search_func=ckan_group_package_show, - search_terms=group_search, - results=results, - ) - results = await perform_search( - search_func=ckan_package_search_from_organization, - search_terms=organization_search, - results=results, - ) - - flat_list = get_flat_list(results) - # Deduplicate entries - flat_list = deduplicate_entries(flat_list) - print("\nRetrieving collections...") - flat_list = get_collection_child_packages(flat_list) - - filtered_results = list(filter(filter_result, flat_list)) - parsed_results = list(map(parse_result, filtered_results)) - - write_to_csv(parsed_results) - -if __name__ == "__main__": - main() diff --git a/source_collectors/ckan/requirements.txt b/source_collectors/ckan/requirements.txt deleted file mode 100644 index fc41154b..00000000 --- a/source_collectors/ckan/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -from_root -ckanapi -bs4 -lxml -tqdm -pandas \ No newline at end of file diff --git a/source_collectors/ckan/schemas.py b/source_collectors/ckan/schemas.py deleted file mode 100644 index 6aeecf09..00000000 --- a/source_collectors/ckan/schemas.py +++ /dev/null @@ -1,6 +0,0 @@ -from marshmallow import Schema, fields - - -class PackageSearchSchema(Schema): - count = fields.Int(required=True) - results = fields.List(fields.Str(), required=True) # TODO: What is the structure of this? \ No newline at end of file diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index ad3d62e2..3a292b02 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -4,7 +4,6 @@ from itertools import chain from typing import Any, Callable, Optional -import pandas as pd from from_root import from_root from tqdm import tqdm @@ -41,26 +40,6 @@ async def perform_search( return results -async def get_collection_child_packages( - results: list[dict[str, Any]] -) -> list[dict[str, Any]]: - """Retrieves the child packages of each collection. - - :param results: List of results. - :return: List of results containing child packages. 
- """ - new_list = [] - - for result in tqdm(results): - if "extras" in result.keys(): - collections = await get_collections(result) - if collections: - new_list += collections[0] - continue - - new_list.append(result) - - return new_list async def get_collections(result): @@ -265,7 +244,3 @@ def deduplicate_entries(flat_list): return flat_list -def write_to_csv(parsed_results): - df = pd.DataFrame(parsed_results) - df.to_csv("results.csv") - diff --git a/source_collectors/common_crawler/README.md b/source_collectors/common_crawler/README.md deleted file mode 100644 index 3701b5d5..00000000 --- a/source_collectors/common_crawler/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Common Crawler - -This module interfaces with the Common Crawl dataset to extract urls. - -## Installation - -Python Version Required: 3.11 - -To install all necessary dependencies, run the following command from the root directory: - -```bash -pip install -r requirements.txt -``` - - -## Usage Example - -### Environment Requirements - -Please ensure you have a `.env` file located in the root directory (not the `common_crawler` directory) -which contains the following environment variable: - -* HUGGINGFACE_ACCESS_TOKEN = The access token to enable writing to the associated PDAP dataset. -To obtain your access token, consult user settings at -and ensure you have write access to . -* LABEL_STUDIO_ACCESS_TOKEN = The access token for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the [user account section](https://app.heartex.com/user/account), where the access token can be copied. -* LABEL_STUDIO_PROJECT_ID = The project ID for the Label Studio API. This can be - obtained by logging into Label Studio and navigating to the relevant project, where the project id will be in the URL. - -### Instructions - -Run the following script from the root directory -```bash -python common_crawler/main.py CC-MAIN-2023-50 '*.gov' police --config common_crawler/config.ini --pages 2 -``` - -This example will crawl a single page (typically 15000 records) of the Common Crawl dataset with ID `CC-MAIN-2023-50` -and search for the term `police` in all the pages with the `.gov` domain. It will use the default configuration file `config.ini` -to determine the json cache location and the location of the output csv file. - -Note that the cache records the most recent page number that was used for given combination of Common Crawl ID, url search term, and keyword. -If the same command is run again, it will start from the next page. -If you want to reset the cache, you can use the `--reset-cache` flag. - -By default, the output csv file will be named `urls.csv` and will be located in the `data` directory of the module. -This csv file contains both the url and the parameters used to query it. - -### Parameters - -- **common_crawl_id**: Required. Specifies the Common Crawl Index to perform the search on. -- **url**: Required. Specifies the domain URL to query. Wildcard characters such as * can be used to expand the search. Note that the query must be contained within quotes (as in '*.gov') to prevent misinterpretation of wildcards -- **search_term**: Required. Specifies keyword within the url to search for. -- **-c or --config**: Optional. Specifies the configuration file to use. The default value is config.ini. -- **-p or --pages**: Optional. Specifies the number of pages to search. The default value is 1. -- **--reset-cache**: Optional. If set, it resets the cache before starting the crawl. 
-
-### Configuration
-
-Several attributes are currently defined in `config.ini`:
-- **cache_filename**: This is the name of the cache file. The default value is `cache`. The file will be saved with a `.json` extension.
-- **output_filename**: This is the name of the output file. The default value is `urls`. The file will be saved with a `.csv` extension.
-- **data_dir**: This is the directory where the cache and output files will be saved. The default value is `data`.
-- **huggingface_repo_id**: This is the repository ID for the hugging face dataset which urls will be uploaded to
-
-## Code Structure
-
-The code is structured as follows:
-- **main.py**: This is the main file that is used to run the module. It contains the logic to parse the command line arguments and call the necessary functions.
-- **crawler.py**: This file contains the logic to interface with the Common Crawl dataset and extract urls.
-- **cache.py**: This file contains the logic to read and write the cache file.
-- **argparser.py**: This file contains the logic to parse the command line and config arguments.
-- **csv_manager.py**: This file contains the logic to write the output csv file.
-- **utils.py**: This file contains utility functions.
-- **config.ini**: This file contains the default configuration values.
-- **README.md**: This file contains the documentation for the module. You're reading it right now. Isn't that nifty!
-
-## Testing
-
-A suite of unit and integration tests were developed for this module.
-
-To run the tests, run the following command from this directory:
-
-```bash
-pytest ../tests/test_common_crawler_integration.py
-pytest ../tests/test_common_crawler_unit.py
-```
\ No newline at end of file
diff --git a/source_collectors/common_crawler/argparser.py b/source_collectors/common_crawler/argparser.py
deleted file mode 100644
index 67f4a290..00000000
--- a/source_collectors/common_crawler/argparser.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import argparse
-import configparser
-import re
-
-"""
-This module contains the argument parser for command line arguments
-for the Common Crawler script.
-"""
-
-
-def valid_common_crawl_id(common_crawl_id: str) -> bool:
-    """
-    Validate the Common Crawl ID format.
-    The Common Crawl ID should be in the format CC-MAIN-YYYY-WW.
-    Args:
-        common_crawl_id: The Common Crawl ID to validate
-    Returns:
-        True if the Common Crawl ID is valid, False otherwise
-    """
-    return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None
-
-
-def parse_args() -> argparse.Namespace:
-    """
-    Parse the command line arguments for the Common Crawler script
-    as well as the configuration file.
-    Arguments parsed include:
-    - The Common Crawl ID
-    - The URL to query
-    - The search term
-    - The number of pages to search
-    - The configuration file (defaults to config.ini)
-    - A flag to reset the cache
-    Returns: The parsed arguments
-    """
-
-    parser = argparse.ArgumentParser(
-        description="Query the Common Crawl dataset and optionally save the results to a file."
- ) - # Add the required arguments - parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") - parser.add_argument("url", type=str, help="The URL to query") - parser.add_argument("keyword", type=str, help="The keyword to search in the url") - # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument( - "-c", - "--config", - type=str, - default="config.ini", - help="The configuration file to use", - ) - parser.add_argument( - "-p", - "--pages", - type=int, - default=1, - help="The number of pages to search (default: 1)", - ) - parser.add_argument( - "--reset-cache", - action="store_true", - default=False, - help="Reset the cache before starting the crawl", - ) - - args = parser.parse_args() - - # Validate the Common Crawl ID format - if not valid_common_crawl_id(args.common_crawl_id): - parser.error( - "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." - ) - - # Read the configuration file - config = configparser.ConfigParser() - config.read(args.config) - - # Combine parsed arguments with configuration file defaults - app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config["DEFAULT"]) - - app_args = app_parser.parse_args() - - # Print arguments - print(f"--Common Crawl ID: {app_args.common_crawl_id}") - print(f"--URL: {app_args.url}") - print(f"--Keyword: {app_args.keyword}") - print(f"--Number of Pages: {app_args.pages}") - print(f"--Configuration File: {app_args.config}") - print(f"--Reset Cache: {app_args.reset_cache}") - print(f"--Output File: {app_args.output_filename}.csv") - print(f"--Cache File: {app_args.cache_filename}.json") - print(f"--Data Directory: {app_args.data_dir}") - - return app_args diff --git a/source_collectors/common_crawler/cache.py b/source_collectors/common_crawler/cache.py deleted file mode 100644 index 23d58819..00000000 --- a/source_collectors/common_crawler/cache.py +++ /dev/null @@ -1,93 +0,0 @@ -import json - -from util.miscellaneous_functions import get_file_path - -""" -This module contains classes for managing a cache of Common Crawl search results -These classes include: - - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results -""" - - -class CommonCrawlerCacheManager: - """ - A class for managing the cache of Common Crawl search results. - This class is responsible for adding, retrieving, and saving cache data. - """ - - def __init__(self, file_name: str = "cache", directory=None): - """ - Initializes the CacheStorage object with a file name and directory. - Args: - file_name: the name of the cache file - directory: the directory to store the cache file - """ - self.file_path = get_file_path(f"{file_name}.json", directory) - print(f"Cache file path: {self.file_path}") - self.cache = self.load_or_create_cache() - - def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: - """ - Updates the cache with the last page crawled for a given index, url, and keyword. - Or adds a new cache object if it does not exist. - Args: - index: the index of the common crawl - url: the url to search - keyword: the search term to use - last_page: the last page crawled - Returns: None - """ - if index not in self.cache: - self.cache[index] = {} - if url not in self.cache[index]: - self.cache[index][url] = {} - self.cache[index][url][keyword] = last_page - - def get(self, index, url, keyword) -> int: - """ - Retrieves a page number from the cache. 
- Args: - index: the index of the common crawl - url: the url to search - keyword: the search term to use - - Returns: int - the last page crawled - - """ - if ( - index in self.cache - and url in self.cache[index] - and keyword in self.cache[index][url] - ): - return self.cache[index][url][keyword] - # The cache object does not exist. Return 0 as the default value. - return 0 - - def load_or_create_cache(self) -> dict: - """ - Loads the cache from the configured file path. - If the file does not exist, an empty dictionary is returned. - Returns: dict - the cache data - """ - try: - with open(self.file_path, "r") as file: - return json.load(file) - except FileNotFoundError: - return {} - - def save_cache(self) -> None: - """ - Converts the cache object into a JSON-serializable format and saves it to the configured file path. - This method ensures the cache is stored in a readable and easily reloadable format, allowing for - persistence of crawl data across sessions. - """ - # Reformat cache data for JSON serialization - with open(self.file_path, "w") as file: - json.dump(self.cache, file, indent=4) - - def reset_cache(self) -> None: - """ - Resets the cache to an empty state. - """ - self.cache = {} - print("Cache has been reset.") diff --git a/source_collectors/common_crawler/config.ini b/source_collectors/common_crawler/config.ini deleted file mode 100644 index fc558303..00000000 --- a/source_collectors/common_crawler/config.ini +++ /dev/null @@ -1,19 +0,0 @@ -# This configuration file contains default settings for the Common Crawler application. -# Settings can be modified to suit different environments or testing needs. - -[DEFAULT] -# Filename for the cache. Stores which pages have been crawled -# at which combinations of index, url search term, and keyword -# to avoid re-crawling them. -cache_filename = cache - -# Directory where data files (both cache and output) are stored. -# Change as needed for different environments. -# Path is relative from working directory that executes common_crawler/main.py -data_dir = common_crawler/data - -# Filename for the output CSV containing crawled URLs. -output_filename = urls - -# Name of the huggingface repo -huggingface_repo_id = PDAP/unlabeled-urls \ No newline at end of file diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py deleted file mode 100644 index 5a80aeaa..00000000 --- a/source_collectors/common_crawler/csv_manager.py +++ /dev/null @@ -1,79 +0,0 @@ -import csv -import os - -from util.miscellaneous_functions import get_file_path - - -class CSVManager: - """ - Manages a CSV file for storing URLs. - Creates the file if it doesn't exist, and provides a method for adding new rows. - """ - - def __init__(self, file_name: str, headers: list[str], directory=None): - """ - Args: - file_name: the name of the CSV file - headers: the headers for the CSV file - directory: the directory to store the CSV file - """ - self.file_path = get_file_path(f"{file_name}.csv", directory) - self.headers = headers - if not os.path.exists(self.file_path): - self.initialize_file() - - def add_row(self, row_values: list[str] | tuple[str]): - """ - Appends a new row of data to the CSV. 
- Args: - row_values: list of values to add to the csv, in order of their inclusion in the list - """ - if isinstance(row_values, str): - # Single values must be converted to a list format - row_values = [row_values] - try: - with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: - writer = csv.writer(file) - writer.writerow(row_values) - except Exception as e: - print(f"An error occurred while trying to write to {self.file_path}: {e}") - - def add_rows(self, results: list[list[str]]) -> None: - """ - Appends multiple rows of data to the CSV as a list of lists of strings. - Args: - results: list[list[str] - a list of lists of strings, each inner list representing a row - Returns: None - """ - for result in results: - self.add_row(result) - print(f"{len(results)} URLs written to {self.file_path}") - - def initialize_file(self): - """ - Initializes the CSV file. - If the file doesn't exist, it creates it with the header row. - """ - # check if file exists - file_exists = os.path.isfile(self.file_path) - - if not file_exists: - with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: - writer = csv.writer(file) - writer.writerow(self.headers) - else: - # Open and check that headers match - with open(self.file_path, mode="r", encoding="utf-8") as file: - header_row = next(csv.reader(file)) - if header_row != self.headers: - raise ValueError( - f"Header row in {self.file_path} does not match expected headers" - ) - print(f"CSV file initialized at {self.file_path}") - - def delete_file(self): - """ - Deletes the CSV file. - """ - os.remove(self.file_path) - print(f"CSV file deleted at {self.file_path}") diff --git a/source_collectors/common_crawler/data/cache.json b/source_collectors/common_crawler/data/cache.json deleted file mode 100644 index e12687ad..00000000 --- a/source_collectors/common_crawler/data/cache.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "CC-MAIN-2023-50": { - "*.gov": { - "police": 10 - } - } -} \ No newline at end of file diff --git a/source_collectors/common_crawler/data/urls.csv b/source_collectors/common_crawler/data/urls.csv deleted file mode 100644 index 6fc4dc6f..00000000 --- a/source_collectors/common_crawler/data/urls.csv +++ /dev/null @@ -1,207 +0,0 @@ -Index,Search Term,Keyword,Page,URL -CC-MAIN-2023-50,*.gov,police,2,https://acworth-ga.gov/administering-the-oath-of-office-to-a-newly-promoted-member-of-the-police-department/ -CC-MAIN-2023-50,*.gov,police,2,https://www.ada.gov/policevideo/policebroadbandgallery.htm -CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/franklintonpolice.htm -CC-MAIN-2023-50,*.gov,police,2,https://archive.ada.gov/illinois_state_police.htm -CC-MAIN-2023-50,*.gov,police,2,https://www.adamn.gov/p/other/police-department -CC-MAIN-2023-50,*.gov,police,2,https://www.adamscountypa.gov/police/earpd -CC-MAIN-2023-50,*.gov,police,2,https://www.aftonwyoming.gov/government/police_department/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/community_relations.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/community_relations.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/crime_snapshot_statistics.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/crime_snapshot_statistics.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/index.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/index.php 
-CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/investigative_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/investigative_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/procedures.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/procedures.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/recruiting/index.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/recruiting/index.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/services_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/services_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/transparency_hub.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/transparency_hub.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/uniform_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/uniform_subdivision.php -CC-MAIN-2023-50,*.gov,police,6,https://www.akronohio.gov/departments/police/zone_command.php -CC-MAIN-2023-50,*.gov,police,6,https://akronohio.gov/departments/police/zone_command.php -CC-MAIN-2023-50,*.gov,police,6,https://adeca.alabama.gov/2022/11/14/gov-ivey-announces-grant-to-help-auburn-police-deter-crime/ -CC-MAIN-2023-50,*.gov,police,7,https://governor.alabama.gov/newsroom/2020/02/kimberly-police-officer-nick-orear-flag-memo/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/de/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ko/sales-use/police-jurisdictions/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/ru/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2015-police-jurisdiction-annexations-deannexations-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2022-police-jurisdiction-annexations-deannexations-and-ordinances/ -CC-MAIN-2023-50,*.gov,police,7,https://www.revenue.alabama.gov/sales-use/2023-police-jurisdiction-deannexations-ordinances-and-maps/ -CC-MAIN-2023-50,*.gov,police,8,https://tourism.alabama.gov/tag/world-police-and-fire-games/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/departments/police-department/community_resources_apd.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/content/public/v/237/departments/police-department/community_resources_apd.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/files/sharedassets/public/alameda/police/policy-manual.pdf -CC-MAIN-2023-50,*.gov,police,8,http://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf -CC-MAIN-2023-50,*.gov,police,8,https://www.alamedaca.gov/sites/default/files/department-files/2016-02-02/alameda_police_department_alpr_policy_20160122.pdf 
-CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/departments/police/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/news/stories/peace-officers-memorial-day-and-national-police-week/ -CC-MAIN-2023-50,*.gov,police,8,https://www.alamoheightstx.gov/public-safety/police/police-blotter/ -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/airport-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/index.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/business/policefire/jobs/ -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/contact-police-fire.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police-fire-organization-chart.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/anc/police.shtml -CC-MAIN-2023-50,*.gov,police,9,https://dot.alaska.gov/faiiap/police-fire/index.shtml -CC-MAIN-2023-50,*.gov,police,10,https://gov.alaska.gov/a-proclamation-on-honoring-united-states-capitol-police-officers/ -CC-MAIN-2023-50,*.gov,police,10,https://geohub.albanyga.gov/datasets/corrected-police-beat -CC-MAIN-2023-50,*.gov,police,10,https://data.albanyny.gov/browse?tags=police+report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/contact-the-albany-police-department -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/departments/police/programs/medication-and-sharps-disposal -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/hr/salary-schedules/police-table -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/apba/scholarship_packet.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/a18_alarm_user_permit_application.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/secondhand_dealer.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/forms/Solicitor_License.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/neighborhood-watch/2013_nw_brochure-update.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/property/propertyinventoryrecord-fillable.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/child_safety_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/facebook_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/linkedln_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/photosharingservices_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/smartphone_smartcard.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/images/stories/police/sm-smartcards/twitter_smart_card.pdf -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/ -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/accreditation -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/administration -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/apd-policies -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/communications-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/community-resource-unit-cru -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/history -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/operations -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/quarterly-report -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/records-section -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/about/support-division -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/contact-apd -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/cold-cases -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/crime/statistics-crime-analysis -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/2dogs -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/apbascholarship -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/filing-a-complaint -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/home-security-alarm-permits -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/patch-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/property-inventory-record -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/forms/ride-along-requests -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/animal-control -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/apba -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/community-police-academy -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/medication-and-sharps-disposal -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/national-night-out -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-speed-watch -CC-MAIN-2023-50,*.gov,police,10,https://albanyoregon.gov/police/programs/neighborhood-speed-watch -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/neighborhood-watch-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/resources -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safe-and-secure-seniors-independent -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safereturn -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/safety-camp -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/tow -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/victim-assistance -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/programs/youthacademy -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/qrcode -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/robots.txt -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/bicycle-theft-prevention-and-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/child-safety -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/crime-prevention-through-environmental-design-cpted 
-CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/online-social-media-safety-tips -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/protecting-your-business -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safe-exchange-zones -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/safety-on-the-road -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/safety/vehicle -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/cadet-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/career-opportunities -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/lateral-officers -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program -CC-MAIN-2023-50,*.gov,police,10,https://www.albanyoregon.gov/police/working-at-apd/volunteer-program -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-02-22/alexandria-police-department-makes-arrest-in-connection-to-shots-fired-incident -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-03-15/alexandria-police-department-apprehends-assault-suspect -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-22/alexandria-police-officer-arrested -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-03-25/alexandria-police-department-investigates-first-homicide-of-the-year -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-04-18/don-hayes-appointed-alexandria-police-chief -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2022-06-06/alexandria-police-makes-arrest-in-fatal-shooting -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-08-29/alexandria-police-department-investigates-serious-crash -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2022-12-21/alexandria-police-department-investigates-shooting-incident -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-09-29/apd-lt-graduates-from-dc-police-leadership-academy -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/news-apd/2023-11-17/apd-assists-fairfax-county-police-in-apprehension-of-suspect-driving-stolen -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/ -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/community-police-academy -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/criminal-investigation-division -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/listing-page/apd-news-releases -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/office-of-the-police-chief -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/other-services -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police-department/police-services -CC-MAIN-2023-50,*.gov,police,11,http://www3.alexandriava.gov/police/crime_reports/reporter.php -CC-MAIN-2023-50,*.gov,police,11,https://www3.alexandriava.gov/police/crime_reports/reporter.php -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=112991 
-CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/default.aspx?id=24274 -CC-MAIN-2023-50,*.gov,police,11,https://www.alexandriava.gov/police/info/default.aspx?id=59358 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=27648 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=33624 -CC-MAIN-2023-50,*.gov,police,11,https://alexandriava.gov/police/info/news_policedisplay.aspx?id=68136 -CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-3030.aspx -CC-MAIN-2023-50,*.gov,police,11,https://wdc.alexandriava.gov/employment/special-police-officer-listing-4122.aspx -CC-MAIN-2023-50,*.gov,police,11,https://aliquippapa.gov/events/light-up-night-at-the-aliquippa-police-station/ -CC-MAIN-2023-50,*.gov,police,11,https://www.almaarkansas.gov/police/ -CC-MAIN-2023-50,*.gov,police,11,https://www.almontmichigan.gov/departments/police-department/ -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/contact-forms/departments/police/report-an-abandoned-vehicle-on-public-streets -CC-MAIN-2023-50,*.gov,police,11,https://www.altoonapa.gov/contacts/police/commander-of-criminal-investigation/lt-ashley-day -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/animal-control -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/directory -CC-MAIN-2023-50,*.gov,police,11,https://altoonapa.gov/departments/police/services -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-documents/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/police-staff/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/how-do-i-file-a-police-report-2/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/question/who-do-i-call-about-police-related-non-emergencies-2/ -CC-MAIN-2023-50,*.gov,police,11,https://alvordtx.gov/topics/police-courts/ -CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt -CC-MAIN-2023-50,*.gov,police,11,http://police.amarillo.gov/robots.txt -CC-MAIN-2023-50,*.gov,police,11,https://share.america.gov/ar/heres-police-held-accountable-shooting-incidents-video/ diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py deleted file mode 100644 index 67bd4c45..00000000 --- a/source_collectors/common_crawler/main.py +++ /dev/null @@ -1,366 +0,0 @@ -import argparse -import collections -import dataclasses -import os -import re -import sys -from datetime import datetime - -from dotenv import load_dotenv - -from source_collectors.common_crawler.argparser import parse_args -from source_collectors.common_crawler.cache import CommonCrawlerCacheManager -from source_collectors.common_crawler.crawler import CommonCrawlResult, CommonCrawlerManager -from source_collectors.common_crawler.csv_manager import CSVManager - -# The below code sets the working directory to be the root of the entire repository -# This is done to solve otherwise quite annoying import issues. -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) - -from util.huggingface_api_manager import HuggingFaceAPIManager -from util.miscellaneous_functions import get_filename_friendly_timestamp -from label_studio_interface.LabelStudioConfig import LabelStudioConfig -from label_studio_interface.LabelStudioAPIManager import LabelStudioAPIManager - -""" -This module contains the main function for the Common Crawler script. 
-""" - - -@dataclasses.dataclass -class BatchInfo: - """ - Dataclass for batch info - """ - datetime: str - source: str - count: str - keywords: str - notes: str - filename: str - - -class LabelStudioError(Exception): - """Custom exception for Label Studio Errors""" - - pass - - -BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] - - -def get_current_time(): - """ - Returns the current time - """ - return str(datetime.now()) - - -def add_batch_info_to_csv( - common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int -) -> BatchInfo: - """ - Adds batch info to CSV - """ - batch_info = BatchInfo( - datetime=get_current_time(), - source="Common Crawl", - count=str(len(common_crawl_result.url_results)), - keywords=f"{args.url} - {args.keyword}", - notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", - ) - - batch_info_csv_manager = CSVManager( - file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS - ) - batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) - - return batch_info - - -def main(): - """ - Main function - """ - # Parse the arguments - args = parse_args() - - # Initialize the Cache - cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, directory=args.data_dir - ) - - load_dotenv() - - # Initialize the HuggingFace API Manager - hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") - if not hf_access_token: - raise ValueError( - "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " - "Please obtain access token from your personal account at " - "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory." - ) - huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, repo_id=args.huggingface_repo_id - ) - ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") - if not ls_access_token: - raise ValueError( - "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " - "Please obtain access token from your personal account at " - "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory." - ) - ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") - if not ls_project_id: - raise ValueError( - "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " - "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory." 
- ) - - try: - print("Retrieving Label Studio data for deduplication") - label_studio_results = get_ls_data() - if label_studio_results is None: - raise LabelStudioError("Failed to retrieve Label Studio Data") - print("Label Studio data retrieved successfully") - except LabelStudioError as e: - print(e) - raise - - if args.reset_cache: - cache_manager.reset_cache() - - try: - # Retrieve the last page from the cache, or 0 if it does not exist - last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload( - args, last_page, huggingface_api_manager, label_studio_results - ) - except ValueError as e: - print(f"Error during crawling: {e}") - return - - try: - cache_manager.upsert( - index=args.common_crawl_id, - url=args.url, - keyword=args.keyword, - last_page=common_crawl_result.last_page_search, - ) - cache_manager.save_cache() - - except ValueError as e: - print(f"Error while saving cache manager: {e}") - - -def handle_remote_results_error(remote_results): - """ - Handles errors in the remote results - - Args: remote_results (dict): The results from the label studio project - Raises: LabelStudioError: If an error is found in the remote results - """ - - status_code = remote_results.get("status_code") - if status_code == 401: - raise LabelStudioError("Invalid Label Studio token passed! Exiting...") - elif status_code == 404: - raise LabelStudioError("Invalid Label Studio Project ID! Exiting...") - else: - raise LabelStudioError(f"Unexpected error: {remote_results}") - - -def validate_remote_results(remote_results): - """ - Validates the remote results retrieved from the Label Studio project - - Args: remote_results (dict or list): The results from the Label Studio project - - Returns: - list[dict]: If the remote results are valid - None: If the remote results are invalid - """ - if isinstance(remote_results, list): - if not remote_results: - print("No data in Label Studio project.") - return [] - elif "url" not in remote_results[0]["data"]: - raise LabelStudioError( - "Column 'url' not present in Label Studio project. Exiting..." - ) - else: - return remote_results - elif isinstance(remote_results, dict): - handle_remote_results_error(remote_results) - else: - raise LabelStudioError("Unexpected response type.") - - -def get_ls_data() -> list[dict] | None: - """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. - - Returns: - list[dict] | None: Data from the Labels Studio project or None if the result is invalid. - """ - # Retrieve the data from the Labels Studio project - config = LabelStudioConfig() - api_manager = LabelStudioAPIManager(config) - response = api_manager.import_tasks_from_project(all_tasks=True) - remote_results = response.json() - - return validate_remote_results(remote_results) - - -def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. - - Args: - url (str): The URL to strip. - - Returns: - str: The stripped URL. - """ - result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) - return result - - -def remove_local_duplicates(url_results: list[str]) -> list[str]: - """Removes duplicate URLs from a list, ignoring http(s)://www. - - Args: - url_results (list[str]): List of URLs to deduplicate. - - Returns: - list[str]: List of unique URLs. 
- """ - stripped_url_results = [strip_url(url) for url in url_results] - unique_urls = collections.deque() - adjust = 0 - - for index, url in enumerate(stripped_url_results): - if url in unique_urls: - del url_results[index - adjust] - adjust += 1 - else: - unique_urls.appendleft(url) - - return url_results - - -def remove_remote_duplicates( - url_results: list[str], label_studio_data: list[dict] -) -> list[str]: - """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. - - Args: - url_results (list[str]): List of URLs to deduplicate. - label_studio_data (list[dict]): Label Studio project data to check for duplicates. - - Returns: - list[str]: List of remaining URLs not present in the Label Studio project. - """ - try: - remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] - except TypeError: - print( - "Invalid Label Studio credentials. Database could not be checked for duplicates." - ) - return url_results - remote_urls = set(remote_urls) - - stripped_url_results = [strip_url(url) for url in url_results] - adjust = 0 - - for index, url in enumerate(stripped_url_results): - if url in remote_urls: - del url_results[index - adjust] - adjust += 1 - - return url_results - - -def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int, -): - """ - Handles the CSV file and uploads it to Hugging Face repository. - Args: - common_crawl_result: The result from Common Crawl. - huggingface_api_manager: The Hugging Face API manager. - args: The command-line arguments. - last_page: last page crawled - - """ - batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) - - csv_manager = CSVManager( - file_name=batch_info.filename, headers=["url"], directory=args.data_dir - ) - csv_manager.add_rows(common_crawl_result.url_results) - huggingface_api_manager.upload_file( - local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", - ) - print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" - ) - csv_manager.delete_file() - - -def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict], -) -> CommonCrawlResult: - """ - Processes a crawl and uploads the results to Hugging Face. - """ - # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager(args.common_crawl_id) - # Determine the pages to search, based on the last page searched - start_page = last_page + 1 - # Use the parsed arguments - common_crawl_result: CommonCrawlResult = crawler_manager.crawl( - search_term=args.url, - keyword=args.keyword, - num_pages=args.pages, - start_page=start_page, - ) - # Logic should conclude here if no results are found - if not common_crawl_result.url_results: - print("No url results found. Ceasing main execution.") - add_batch_info_to_csv(common_crawl_result, args, last_page) - return common_crawl_result - - print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates( - common_crawl_result.url_results - ) - common_crawl_result.url_results = remove_remote_duplicates( - common_crawl_result.url_results, label_studio_data - ) - if not common_crawl_result.url_results: - print( - "No urls not already present in the database found. 
Ceasing main execution." - ) - add_batch_info_to_csv(common_crawl_result, args, last_page) - return common_crawl_result - - handle_csv_and_upload(common_crawl_result, huggingface_api_manager, args, last_page) - - return common_crawl_result - - -if __name__ == "__main__": - # Example usage: python main.py CC-MAIN-2023-50 *.gov "police" - # Usage with optional arguments: python main.py CC-MAIN-2023-50 *.gov "police" -p 2 -o police_urls.txt - print("Running Common Crawler...") - main() diff --git a/source_collectors/common_crawler/requirements_common_crawler_action.txt b/source_collectors/common_crawler/requirements_common_crawler_action.txt deleted file mode 100644 index 22823fd0..00000000 --- a/source_collectors/common_crawler/requirements_common_crawler_action.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests~=2.31.0 -python-dotenv~=1.0.1 -huggingface-hub~=0.22.2 \ No newline at end of file diff --git a/source_collectors/common_crawler/schemas.py b/source_collectors/common_crawler/schemas.py deleted file mode 100644 index 608f9632..00000000 --- a/source_collectors/common_crawler/schemas.py +++ /dev/null @@ -1,22 +0,0 @@ -from marshmallow import Schema, fields - - -class CommonCrawlerConfigSchema(Schema): - common_crawl_id = fields.String( - required=True, - description="The Common Crawl ID", - example="CC-MAIN-2022-10" - ) - url = fields.String(required=True, description="The URL to query", example="*.gov") - keyword = fields.String(required=True, description="The keyword to search in the url", example="police") - start_page = fields.Integer(required=False, description="The page to start from", example=1) - pages = fields.Integer(required=False, description="The number of pages to search", example=1) - -class CommonCrawlerOutputSchema(Schema): - urls = fields.List( - fields.String( - required=True - ), - required=True, - description="The list of URLs found in the search" - ) \ No newline at end of file diff --git a/agency_identifier/MuckrockAPIInterface.py b/source_collectors/muckrock/MuckrockAPIInterface.py similarity index 100% rename from agency_identifier/MuckrockAPIInterface.py rename to source_collectors/muckrock/MuckrockAPIInterface.py diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md index 43bae80d..a7e75b71 100644 --- a/source_collectors/muckrock/README.md +++ b/source_collectors/muckrock/README.md @@ -4,85 +4,3 @@ This repo provides tools for searching Muckrock FOIA requests, it includes scripts for downloading data from MuckRock, generating CSV files per PDAP database requirements, and automatic labeling -## Installation - -### 1. Clone the `scrapers` repository and navigate to the `muckrock_tools` directory. - -``` -git clone git@github.com:Police-Data-Accessibility-Project/scrapers.git -cd scrapers/scrapers_library/data_portals/muckrock/muckrock_tools -``` - -### 2. Create a virtual environment. - -If you don't already have virtualenv, install the package: - -``` - -pip install virtualenv - -``` - -Then run the following command to create a virtual environment (ensure the python version is as below): - -``` - -virtualenv -p python3.12 venv - -``` - -### 3. Activate the virtual environment. - -``` - -source venv/bin/activate - -``` - -### 4. Install dependencies. - -``` - -pip install -r requirements.txt - -``` - -## Uses - -### 1. Simple Search Term - -- `muck_get.py` -- script to perform searches on MuckRock's database, by matching a search string to title of request. Search is slow due to rate limiting (cannot multi thread around it). 
-
-### 2. Clone Muckrock database & search locally
-
-- Scripts to clone the MuckRock FOIA request collection for fast local querying (total size <2GB at present).
-
-- `create_foia_data_db.py` creates and populates a SQLite database (`foia_data.db`) with all MuckRock FOIA requests. Various errors outside the scope of this script may occur; a counter (`last_page_fetched.txt`) keeps track of the most recent page fetched and inserted into the database. If the program exits prematurely, simply run `create_foia_data_db.py` again to continue where you left off. A log file captures errors for later reference.
-
-- After `foia_data.db` is created, run `search_foia_data_db.py`, which takes a search string as input and outputs a JSON file with all related FOIA requests for later processing by `generate_detailed_muckrock_csv.py`. For example,
-
-```
-python3 create_foia_data_db.py
-
-python3 search_foia_data_db.py --search_for "use of force"
-```
-
-produces `use_of_force.json`.
-
-### 3. County Level Search
-
-- `get_allegheny_foias.py`, `allegheny_county_towns.txt`
-- To search for all requests in a given county (e.g. Allegheny in this case), you must provide a list of all municipalities contained within that county. MuckRock stores geographic information in tiers: federal, state, and local. At the local level, Pittsburgh and Allegheny County sit in the same tier, so there is no way to determine which municipalities reside within a county without supplying that list yourself.
-
-The `get_allegheny_foias.py` script finds the jurisdiction ID for each municipality in `allegheny_county_towns.txt`, then finds all completed FOIA requests for those jurisdictions.
-
-### 4. Generate detailed FOIA data in PDAP database format
-
-- `generate_detailed_muckrock_csv.py`
-- Once you have a JSON file of relevant FOIAs, run it through this script to generate a CSV that fulfills PDAP database requirements.
-
-### 5. ML Labeling
-
-- `muckrock_ml_labeler.py`
-- A tool for auto-labeling MuckRock sources. This script uses [fine-url-classifier](https://huggingface.co/PDAP/fine-url-classifier) to assign one of 36 record type labels. At present, the script expects each source to have associated header tags, provided via `html-tag-collector/collector.py`.
(TODO: For muckrock sources, `collector.py` insufficient, does not grab main text of the request) diff --git a/source_collectors/muckrock/classes/SQLiteClient.py b/source_collectors/muckrock/classes/SQLiteClient.py deleted file mode 100644 index 96a59d82..00000000 --- a/source_collectors/muckrock/classes/SQLiteClient.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -import sqlite3 - - -class SQLClientError(Exception): - pass - - -class SQLiteClient: - - def __init__(self, db_path: str) -> None: - self.conn = sqlite3.connect(db_path) - - def execute_query(self, query: str, many=None): - - try: - if many is not None: - self.conn.executemany(query, many) - else: - self.conn.execute(query) - self.conn.commit() - except sqlite3.Error as e: - print(f"SQLite error: {e}") - error_msg = f"Failed to execute query due to SQLite error: {e}" - logging.error(error_msg) - self.conn.rollback() - raise SQLClientError(error_msg) - -class SQLiteClientContextManager: - - def __init__(self, db_path: str) -> None: - self.client = SQLiteClient(db_path) - - def __enter__(self): - return self.client - - def __exit__(self, exc_type, exc_value, traceback): - self.client.conn.close() \ No newline at end of file diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py deleted file mode 100644 index b958b61c..00000000 --- a/source_collectors/muckrock/muck_get.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -A straightforward standalone script for downloading data from MuckRock -and searching for it with a specific search string. -""" -from source_collectors.muckrock.classes.FOIASearcher import FOIASearcher -from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher -from source_collectors.muckrock.utils import save_json_file - -if __name__ == "__main__": - search_term = "use of force" - fetcher = FOIAFetcher() - searcher = FOIASearcher(fetcher=fetcher, search_term=search_term) - results = searcher.search_to_count(20) - json_out_file = search_term.replace(" ", "_") + ".json" - save_json_file(file_path=json_out_file, data=results) - print(f"List dumped into {json_out_file}") diff --git a/source_collectors/muckrock/requirements.txt b/source_collectors/muckrock/requirements.txt deleted file mode 100644 index babb4f3e..00000000 --- a/source_collectors/muckrock/requirements.txt +++ /dev/null @@ -1,30 +0,0 @@ -certifi==2024.8.30 -charset-normalizer==3.4.0 -filelock==3.16.1 -fsspec==2024.10.0 -huggingface-hub==0.26.1 -idna==3.10 -Jinja2==3.1.4 -logging==0.4.9.6 -MarkupSafe==3.0.2 -mpmath==1.3.0 -networkx==3.4.2 -numpy==2.1.2 -packaging==24.1 -pandas==2.2.3 -python-dateutil==2.9.0.post0 -pytz==2024.2 -PyYAML==6.0.2 -regex==2024.9.11 -requests==2.32.3 -safetensors==0.4.5 -setuptools==75.2.0 -six==1.16.0 -sympy==1.13.1 -tokenizers==0.20.1 -torch==2.5.0 -tqdm==4.66.5 -transformers==4.46.0 -typing_extensions==4.12.2 -tzdata==2024.2 -urllib3==2.2.3 diff --git a/tests/manual/agency_identifier/test_muckrock_api_interface.py b/tests/manual/agency_identifier/test_muckrock_api_interface.py index 2dac6bd4..e3a86ed9 100644 --- a/tests/manual/agency_identifier/test_muckrock_api_interface.py +++ b/tests/manual/agency_identifier/test_muckrock_api_interface.py @@ -1,7 +1,7 @@ import pytest from aiohttp import ClientSession -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface @pytest.mark.asyncio diff --git a/tests/manual/unsorted/test_identifier_unit.py 
b/tests/manual/unsorted/test_identifier_unit.py deleted file mode 100644 index a6dcc1fb..00000000 --- a/tests/manual/unsorted/test_identifier_unit.py +++ /dev/null @@ -1,275 +0,0 @@ -import tempfile -from unittest.mock import patch - -import pytest -import requests_mock - -from agency_identifier.identifier import * - - -@pytest.fixture -def mock_env(monkeypatch): - monkeypatch.setenv("VUE_APP_PDAP_API_KEY", "test_api_key") - - -def test_get_page_data_success(mock_env): - with requests_mock.Mocker() as m: - m.get("https://data-sources.pdap.io/api/agencies/1", json={"data": "test_data"}, status_code=200) - data = get_page_data(1) - assert data == "test_data" - - -def test_get_page_data_failure(mock_env): - with requests_mock.Mocker() as m: - m.get("https://data-sources.pdap.io/api/agencies/1", status_code=404) - with pytest.raises(Exception): - get_page_data(1) - - -@pytest.mark.parametrize("url,expected", [ - ("http://www.example.com", "example.com"), - ("https://example.com", "example.com"), - ("example.com", "example.com"), - ("www.example.com", "example.com"), -]) -def test_parse_hostname(url, expected): - assert parse_hostname(url) == expected - - -@pytest.mark.parametrize("url", [ - "http:///www.example.com", # Invalid URL - "://example.com", # Missing scheme -]) -def test_parse_hostname_failure(url): - with pytest.raises(Exception): - parse_hostname(url) - - -@pytest.mark.parametrize("url,expected", [ - ("http://www.example.com", "example.com/"), - ("https://example.com", "example.com/"), - ("http://example.com/path/to/page", "example.com/path/to/page/"), - ("www.example.com", "example.com/"), - ("example.com/", "example.com/"), -]) -def test_remove_http(url, expected): - assert remove_http(url) == expected - - -@pytest.fixture -def agencies_and_hostnames(): - return ( - [{"name": "Agency 1", "homepage_url": "https://agency1.com"}], - ["agency1.com"] - ) - - -def test_match_agencies_found(agencies_and_hostnames): - agencies, agency_hostnames = agencies_and_hostnames - match = match_agencies(agencies, agency_hostnames, "http://www.agency1.com/page") - assert match["status"] == "Match found" - assert match["agency"]["name"] == "Agency 1" - - -def test_match_agencies_no_match(agencies_and_hostnames): - agencies, agency_hostnames = agencies_and_hostnames - match = match_agencies(agencies, agency_hostnames, "http://www.nonexistentagency.com") - assert match["status"] == "No match found" - assert match["agency"] == [] - -@pytest.fixture -def agencies_with_same_hostname(): - return ( - [ - {"name": "Agency 1", "homepage_url": "http://agency.com/path1"}, - {"name": "Agency 2", "homepage_url": "http://agency.com/path2"} - ], - ["agency.com", "agency.com"] - ) - -def test_match_agencies_multiple_found(agencies_with_same_hostname): - agencies, agency_hostnames = agencies_with_same_hostname - # A URL that matches the first agency more closely - match = match_agencies(agencies, agency_hostnames, "http://agency.com/path1/page") - assert match["status"] == "Match found" - assert match["agency"]["name"] == "Agency 1" - - # A URL that doesn't closely match either agency's homepage URL path - contested_match = match_agencies(agencies, agency_hostnames, "http://agency.com/otherpath/page") - assert contested_match["status"] == "Contested match" - assert contested_match["agency"] == [] - - # A URL that matches the second agency more closely - match_second = match_agencies(agencies, agency_hostnames, "http://agency.com/path2/anotherpage") - assert match_second["status"] == "Match found" - assert 
match_second["agency"]["name"] == "Agency 2" - -@patch('agency_identifier.identifier.get_page_data') -def test_get_agencies_data(mock_get_page_data, mock_env): - # Mock get_page_data to return a dictionary on the first call and an empty dictionary on the second call - mock_get_page_data.side_effect = [ - [{"name": "Agency 1", "homepage_url": "https://agency1.com", "id": "1"}], # First page data - [] # Indicates no more pages - ] - - df = get_agencies_data() - assert not df.is_empty() - assert len(df) == 1 - assert df["name"][0] == "Agency 1" - assert df["homepage_url"][0] == "https://agency1.com" - - -# Sample data to simulate what `match_urls_to_agencies_and_clean_data` might return -sample_agencies_data = polars.DataFrame({ - "url": ["http://agency1.com", "http://agency2.com", "http://nonexistentagency.com"], - "homepage_url": ["http://agency1.com", "http://agency2.com", None], - "hostname": ["agency1.com", "agency2.com", None], -}) - -# Sample input URLs DataFrame -sample_urls_df = polars.DataFrame({ - "url": ["http://agency1.com/page1", "http://agency2.com/page2", "http://nonexistentagency.com/page"] -}) - - -@pytest.fixture -def mock_match_urls_to_agencies_and_clean_data(): - with patch('agency_identifier.identifier.match_urls_to_agencies_and_clean_data') as mock: - mock.return_value = sample_agencies_data - yield mock - - -def test_process_data(mock_match_urls_to_agencies_and_clean_data): - processed_df = process_data(sample_urls_df) - - # Verify that the mock was called once with the sample_urls_df - mock_match_urls_to_agencies_and_clean_data.assert_called_once_with(sample_urls_df) - - # Check that the processed DataFrame has filtered out the unmatched URLs - assert len(processed_df) == 2 # Expecting only matched URLs to be present - - # Check if the 'hostname' column exists and has no null values in the result - assert "hostname" in processed_df.columns - assert processed_df.filter(polars.col("hostname").is_null()).height == 0 - - # You might also want to check specific values if necessary - assert processed_df["url"].to_list() == ["http://agency1.com", "http://agency2.com"] - - -# Sample data to simulate what `get_agencies_data` might return -sample_get_agencies_data = polars.DataFrame({ - "homepage_url": ["http://agency1.com", "http://agency2.com"], - "name": ["Agency 1", "Agency 2"], - "count_data_sources": [10, 15], - "hostname": ["agency1.com", "agency2.com"], # Assume this is added by the function -}) - - -@pytest.fixture -def mock_get_agencies_data(): - with patch('agency_identifier.identifier.get_agencies_data') as mock: - mock.return_value = sample_get_agencies_data - yield mock - - -def test_match_urls_to_agencies_and_clean_data(mock_get_agencies_data): - matched_df = match_urls_to_agencies_and_clean_data(sample_urls_df) - - # Verify that `get_agencies_data` was called - mock_get_agencies_data.assert_called_once() - - # Verify the structure and content of the matched DataFrame - # Expect that each URL is matched with the correct agency based on the hostname - # Additionally, check for the addition of any new columns or transformations you apply - assert "homepage_url" in matched_df.columns - assert len(matched_df) == len(sample_urls_df) # Ensure all URLs are processed - - # Verify that URLs are correctly matched or not matched to agencies - # This assumes that the function annotates the DataFrame with match results - assert matched_df.filter(polars.col("url") == "http://agency1.com/page1").select("name")["name"][0] == "Agency 1" - assert 
matched_df.filter(polars.col("url") == "http://nonexistentagency.com/page").select("name")["name"][0] == "" - - -def test_read_data_success(): - # Create a temporary file with some CSV content - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp: - tmp.write("column1,column2\nvalue1,value2") - tmp_path = tmp.name - - # Attempt to read the file with read_data - try: - df = read_data(tmp_path) - assert not df.is_empty() - assert "column1" in df.columns - assert df.shape == (1, 2) - finally: - # Clean up the temporary file - os.remove(tmp_path) - -def test_read_data_failure(): - # Test reading a non-existent file should raise an exception - with pytest.raises(Exception): - read_data("non_existent_file.csv") - - -def test_write_data_success(): - # Create a DataFrame to write - df = polars.DataFrame({"column1": ["value1"], "column2": ["value2"]}) - - # Use a temporary file to write the DataFrame - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp: - tmp_path = tmp.name - - # Write the DataFrame and verify the file contents - try: - write_data(df, tmp_path) - - # Read back the file to verify contents - with open(tmp_path, 'r') as f: - content = f.read() - assert "column1,column2" in content - assert "value1,value2" in content - finally: - # Clean up the temporary file - os.remove(tmp_path) - - -def test_write_data_failure(monkeypatch): - # Simulate an error by patching the `write_csv` method to raise an exception - with monkeypatch.context() as m: - m.setattr(polars.DataFrame, "write_csv", - lambda self, file_path: (_ for _ in ()).throw(Exception("Mock write failure"))) - with pytest.raises(Exception) as exc_info: - df = polars.DataFrame({"column1": ["value1"], "column2": ["value2"]}) - write_data(df, "path/to/non_writable_directory/file.csv") - assert "Mock write failure" in str(exc_info.value) - -@patch('agency_identifier.identifier.write_data') -@patch('agency_identifier.identifier.process_data') -@patch('agency_identifier.identifier.read_data') -def test_process_and_write_data_success(mock_read_data, mock_process_data, mock_write_data): - # Setup mock return values - mock_read_data.return_value = polars.DataFrame({"url": ["http://example.com"]}) - processed_df = polars.DataFrame({"url": ["http://example.com"], "processed": [True]}) - mock_process_data.return_value = processed_df - - # Call the function with mocked input and output file paths - process_and_write_data("input_file.csv", "output_file.csv") - - # Verify that read_data and write_data were called correctly - mock_read_data.assert_called_once_with("input_file.csv") - mock_process_data.assert_called_once_with(mock_read_data.return_value) - mock_write_data.assert_called_once_with(processed_df, "output_file.csv") - -@pytest.mark.parametrize("side_effect,expected_exception", [ - (FileNotFoundError, FileNotFoundError), - (PermissionError, PermissionError), -]) -@patch('agency_identifier.identifier.write_data') -@patch('agency_identifier.identifier.process_data') -@patch('agency_identifier.identifier.read_data') -def test_process_and_write_data_failure(mock_read_data, mock_process_data, mock_write_data, side_effect, expected_exception): - mock_read_data.side_effect = side_effect - - with pytest.raises(expected_exception): - process_and_write_data("input_file.csv", "output_file.csv") \ No newline at end of file diff --git a/tests/test_automated/integration/api/conftest.py b/tests/test_automated/integration/api/conftest.py index ae34b28e..1dc05b44 100644 --- a/tests/test_automated/integration/api/conftest.py +++ 
b/tests/test_automated/integration/api/conftest.py @@ -21,7 +21,6 @@ class APITestHelper: async_core: AsyncCore db_data_creator: DBDataCreator mock_huggingface_interface: MagicMock - mock_label_studio_interface: MagicMock def adb_client(self): return self.db_data_creator.adb_client @@ -71,6 +70,5 @@ async def api_test_helper(client: TestClient, db_data_creator, monkeypatch) -> A async_core=client.app.state.async_core, db_data_creator=db_data_creator, mock_huggingface_interface=MagicMock(), - mock_label_studio_interface=MagicMock() ) await client.app.state.async_core.collector_manager.logger.clear_log_queue() diff --git a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py index 8fb9f4a5..cd9556cb 100644 --- a/tests/test_automated/integration/tasks/test_agency_preannotation_task.py +++ b/tests/test_automated/integration/tasks/test_agency_preannotation_task.py @@ -5,7 +5,7 @@ import pytest from aiohttp import ClientSession -from agency_identifier.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse +from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse from collector_db.models import Agency, AutomatedUrlAgencySuggestion from collector_manager.enums import CollectorType from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome